1 | /* |
2 | * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. |
3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 | * |
5 | * This code is free software; you can redistribute it and/or modify it |
6 | * under the terms of the GNU General Public License version 2 only, as |
7 | * published by the Free Software Foundation. |
8 | * |
9 | * This code is distributed in the hope that it will be useful, but WITHOUT |
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
12 | * version 2 for more details (a copy is included in the LICENSE file that |
13 | * accompanied this code). |
14 | * |
15 | * You should have received a copy of the GNU General Public License version |
16 | * 2 along with this work; if not, write to the Free Software Foundation, |
17 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
18 | * |
19 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
20 | * or visit www.oracle.com if you need additional information or have any |
21 | * questions. |
22 | * |
23 | */ |
24 | |
25 | #include "precompiled.hpp" |
26 | #include "jvm.h" |
27 | #include "asm/macroAssembler.hpp" |
28 | #include "asm/macroAssembler.inline.hpp" |
29 | #include "logging/log.hpp" |
30 | #include "logging/logStream.hpp" |
31 | #include "memory/resourceArea.hpp" |
32 | #include "runtime/java.hpp" |
33 | #include "runtime/os.hpp" |
34 | #include "runtime/stubCodeGenerator.hpp" |
35 | #include "utilities/virtualizationSupport.hpp" |
36 | #include "vm_version_x86.hpp" |
37 | |
38 | |
39 | int VM_Version::_cpu; |
40 | int VM_Version::_model; |
41 | int VM_Version::_stepping; |
42 | VM_Version::CpuidInfo VM_Version::_cpuid_info = { 0, }; |
43 | |
44 | // Address of instruction which causes SEGV |
45 | address VM_Version::_cpuinfo_segv_addr = 0; |
46 | // Address of instruction after the one which causes SEGV |
47 | address VM_Version::_cpuinfo_cont_addr = 0; |
48 | |
49 | static BufferBlob* stub_blob; |
50 | static const int stub_size = 1100; |
51 | |
52 | extern "C" { |
53 | typedef void (*get_cpu_info_stub_t)(void*); |
54 | } |
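// The stub below is generated once at VM startup by VM_Version_StubGenerator
// and is called with the address of _cpuid_info to record the raw CPUID results.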
55 | static get_cpu_info_stub_t get_cpu_info_stub = NULL; |
56 | |
57 | |
58 | class VM_Version_StubGenerator: public StubCodeGenerator { |
59 | public: |
60 | |
61 | VM_Version_StubGenerator(CodeBuffer *c) : StubCodeGenerator(c) {} |
62 | |
63 | address generate_get_cpu_info() { |
64 | // Flags to test CPU type. |
65 | const uint32_t HS_EFL_AC = 0x40000; |
66 | const uint32_t HS_EFL_ID = 0x200000; |
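    // AC is EFLAGS bit 18 (alignment check) and ID is bit 21; only CPUs that
    // can toggle the ID bit support the CPUID instruction.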
67 | // Values for when we don't have a CPUID instruction. |
68 | const int CPU_FAMILY_SHIFT = 8; |
69 | const uint32_t CPU_FAMILY_386 = (3 << CPU_FAMILY_SHIFT); |
70 | const uint32_t CPU_FAMILY_486 = (4 << CPU_FAMILY_SHIFT); |
71 | bool use_evex = FLAG_IS_DEFAULT(UseAVX) || (UseAVX > 2); |
72 | |
73 | Label detect_486, cpu486, detect_586, std_cpuid1, std_cpuid4; |
74 | Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7, ext_cpuid8, done, wrapup; |
75 | Label legacy_setup, save_restore_except, legacy_save_restore, start_simd_check; |
76 | |
77 | StubCodeMark mark(this, "VM_Version" , "get_cpu_info_stub" ); |
78 | # define __ _masm-> |
79 | |
80 | address start = __ pc(); |
81 | |
82 | // |
83 | // void get_cpu_info(VM_Version::CpuidInfo* cpuid_info); |
84 | // |
85 | // LP64: rcx and rdx are first and second argument registers on windows |
86 | |
87 | __ push(rbp); |
88 | #ifdef _LP64 |
89 | __ mov(rbp, c_rarg0); // cpuid_info address |
90 | #else |
91 | __ movptr(rbp, Address(rsp, 8)); // cpuid_info address |
92 | #endif |
93 | __ push(rbx); |
94 | __ push(rsi); |
    __ pushf();          // preserve flags (rbx and rsi were saved above)
96 | __ pop(rax); |
97 | __ push(rax); |
98 | __ mov(rcx, rax); |
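    // rax and rcx now hold the original EFLAGS; a copy also remains on the
    // stack for the final popf in wrapup.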
99 | // |
100 | // if we are unable to change the AC flag, we have a 386 |
101 | // |
102 | __ xorl(rax, HS_EFL_AC); |
103 | __ push(rax); |
104 | __ popf(); |
105 | __ pushf(); |
106 | __ pop(rax); |
107 | __ cmpptr(rax, rcx); |
108 | __ jccb(Assembler::notEqual, detect_486); |
109 | |
110 | __ movl(rax, CPU_FAMILY_386); |
111 | __ movl(Address(rbp, in_bytes(VM_Version::std_cpuid1_offset())), rax); |
112 | __ jmp(done); |
113 | |
114 | // |
115 | // If we are unable to change the ID flag, we have a 486 which does |
116 | // not support the "cpuid" instruction. |
117 | // |
118 | __ bind(detect_486); |
119 | __ mov(rax, rcx); |
120 | __ xorl(rax, HS_EFL_ID); |
121 | __ push(rax); |
122 | __ popf(); |
123 | __ pushf(); |
124 | __ pop(rax); |
125 | __ cmpptr(rcx, rax); |
126 | __ jccb(Assembler::notEqual, detect_586); |
127 | |
128 | __ bind(cpu486); |
129 | __ movl(rax, CPU_FAMILY_486); |
130 | __ movl(Address(rbp, in_bytes(VM_Version::std_cpuid1_offset())), rax); |
131 | __ jmp(done); |
132 | |
133 | // |
134 | // At this point, we have a chip which supports the "cpuid" instruction |
135 | // |
136 | __ bind(detect_586); |
137 | __ xorl(rax, rax); |
138 | __ cpuid(); |
139 | __ orl(rax, rax); |
140 | __ jcc(Assembler::equal, cpu486); // if cpuid doesn't support an input |
141 | // value of at least 1, we give up and |
142 | // assume a 486 |
143 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid0_offset()))); |
144 | __ movl(Address(rsi, 0), rax); |
145 | __ movl(Address(rsi, 4), rbx); |
146 | __ movl(Address(rsi, 8), rcx); |
147 | __ movl(Address(rsi,12), rdx); |
148 | |
149 | __ cmpl(rax, 0xa); // Is cpuid(0xB) supported? |
150 | __ jccb(Assembler::belowEqual, std_cpuid4); |
151 | |
152 | // |
153 | // cpuid(0xB) Processor Topology |
154 | // |
155 | __ movl(rax, 0xb); |
156 | __ xorl(rcx, rcx); // Threads level |
157 | __ cpuid(); |
158 | |
159 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::tpl_cpuidB0_offset()))); |
160 | __ movl(Address(rsi, 0), rax); |
161 | __ movl(Address(rsi, 4), rbx); |
162 | __ movl(Address(rsi, 8), rcx); |
163 | __ movl(Address(rsi,12), rdx); |
164 | |
165 | __ movl(rax, 0xb); |
166 | __ movl(rcx, 1); // Cores level |
167 | __ cpuid(); |
168 | __ push(rax); |
169 | __ andl(rax, 0x1f); // Determine if valid topology level |
    __ orl(rax, rbx);      // eax[4:0] | ebx[15:0] == 0 indicates invalid level
171 | __ andl(rax, 0xffff); |
172 | __ pop(rax); |
173 | __ jccb(Assembler::equal, std_cpuid4); |
174 | |
175 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::tpl_cpuidB1_offset()))); |
176 | __ movl(Address(rsi, 0), rax); |
177 | __ movl(Address(rsi, 4), rbx); |
178 | __ movl(Address(rsi, 8), rcx); |
179 | __ movl(Address(rsi,12), rdx); |
180 | |
181 | __ movl(rax, 0xb); |
182 | __ movl(rcx, 2); // Packages level |
183 | __ cpuid(); |
184 | __ push(rax); |
185 | __ andl(rax, 0x1f); // Determine if valid topology level |
    __ orl(rax, rbx);      // eax[4:0] | ebx[15:0] == 0 indicates invalid level
187 | __ andl(rax, 0xffff); |
188 | __ pop(rax); |
189 | __ jccb(Assembler::equal, std_cpuid4); |
190 | |
191 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::tpl_cpuidB2_offset()))); |
192 | __ movl(Address(rsi, 0), rax); |
193 | __ movl(Address(rsi, 4), rbx); |
194 | __ movl(Address(rsi, 8), rcx); |
195 | __ movl(Address(rsi,12), rdx); |
196 | |
197 | // |
198 | // cpuid(0x4) Deterministic cache params |
199 | // |
200 | __ bind(std_cpuid4); |
201 | __ movl(rax, 4); |
202 | __ cmpl(rax, Address(rbp, in_bytes(VM_Version::std_cpuid0_offset()))); // Is cpuid(0x4) supported? |
203 | __ jccb(Assembler::greater, std_cpuid1); |
204 | |
205 | __ xorl(rcx, rcx); // L1 cache |
206 | __ cpuid(); |
207 | __ push(rax); |
208 | __ andl(rax, 0x1f); // Determine if valid cache parameters used |
209 | __ orl(rax, rax); // eax[4:0] == 0 indicates invalid cache |
210 | __ pop(rax); |
211 | __ jccb(Assembler::equal, std_cpuid1); |
212 | |
213 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::dcp_cpuid4_offset()))); |
214 | __ movl(Address(rsi, 0), rax); |
215 | __ movl(Address(rsi, 4), rbx); |
216 | __ movl(Address(rsi, 8), rcx); |
217 | __ movl(Address(rsi,12), rdx); |
218 | |
219 | // |
220 | // Standard cpuid(0x1) |
221 | // |
222 | __ bind(std_cpuid1); |
223 | __ movl(rax, 1); |
224 | __ cpuid(); |
225 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset()))); |
226 | __ movl(Address(rsi, 0), rax); |
227 | __ movl(Address(rsi, 4), rbx); |
228 | __ movl(Address(rsi, 8), rcx); |
229 | __ movl(Address(rsi,12), rdx); |
230 | |
231 | // |
232 | // Check if OS has enabled XGETBV instruction to access XCR0 |
233 | // (OSXSAVE feature flag) and CPU supports AVX |
234 | // |
235 | __ andl(rcx, 0x18000000); // cpuid1 bits osxsave | avx |
236 | __ cmpl(rcx, 0x18000000); |
237 | __ jccb(Assembler::notEqual, sef_cpuid); // jump if AVX is not supported |
238 | |
239 | // |
240 | // XCR0, XFEATURE_ENABLED_MASK register |
241 | // |
242 | __ xorl(rcx, rcx); // zero for XCR0 register |
243 | __ xgetbv(); |
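    // XGETBV with rcx == 0 returns XCR0 in edx:eax; record both halves.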
244 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); |
245 | __ movl(Address(rsi, 0), rax); |
246 | __ movl(Address(rsi, 4), rdx); |
247 | |
248 | // |
249 | // cpuid(0x7) Structured Extended Features |
250 | // |
251 | __ bind(sef_cpuid); |
252 | __ movl(rax, 7); |
253 | __ cmpl(rax, Address(rbp, in_bytes(VM_Version::std_cpuid0_offset()))); // Is cpuid(0x7) supported? |
254 | __ jccb(Assembler::greater, ext_cpuid); |
255 | |
256 | __ xorl(rcx, rcx); |
257 | __ cpuid(); |
258 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset()))); |
259 | __ movl(Address(rsi, 0), rax); |
260 | __ movl(Address(rsi, 4), rbx); |
261 | __ movl(Address(rsi, 8), rcx); |
262 | __ movl(Address(rsi, 12), rdx); |
263 | |
264 | // |
265 | // Extended cpuid(0x80000000) |
266 | // |
267 | __ bind(ext_cpuid); |
268 | __ movl(rax, 0x80000000); |
269 | __ cpuid(); |
270 | __ cmpl(rax, 0x80000000); // Is cpuid(0x80000001) supported? |
271 | __ jcc(Assembler::belowEqual, done); |
272 | __ cmpl(rax, 0x80000004); // Is cpuid(0x80000005) supported? |
273 | __ jcc(Assembler::belowEqual, ext_cpuid1); |
274 | __ cmpl(rax, 0x80000006); // Is cpuid(0x80000007) supported? |
275 | __ jccb(Assembler::belowEqual, ext_cpuid5); |
276 | __ cmpl(rax, 0x80000007); // Is cpuid(0x80000008) supported? |
277 | __ jccb(Assembler::belowEqual, ext_cpuid7); |
278 | __ cmpl(rax, 0x80000008); // Is cpuid(0x80000009 and above) supported? |
279 | __ jccb(Assembler::belowEqual, ext_cpuid8); |
280 | __ cmpl(rax, 0x8000001E); // Is cpuid(0x8000001E) supported? |
281 | __ jccb(Assembler::below, ext_cpuid8); |
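    // The extended leaves below are emitted in descending order; each block
    // falls through into the next lower leaf so that every supported leaf is read.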
282 | // |
283 | // Extended cpuid(0x8000001E) |
284 | // |
285 | __ movl(rax, 0x8000001E); |
286 | __ cpuid(); |
287 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::ext_cpuid1E_offset()))); |
288 | __ movl(Address(rsi, 0), rax); |
289 | __ movl(Address(rsi, 4), rbx); |
290 | __ movl(Address(rsi, 8), rcx); |
291 | __ movl(Address(rsi,12), rdx); |
292 | |
293 | // |
294 | // Extended cpuid(0x80000008) |
295 | // |
296 | __ bind(ext_cpuid8); |
297 | __ movl(rax, 0x80000008); |
298 | __ cpuid(); |
299 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::ext_cpuid8_offset()))); |
300 | __ movl(Address(rsi, 0), rax); |
301 | __ movl(Address(rsi, 4), rbx); |
302 | __ movl(Address(rsi, 8), rcx); |
303 | __ movl(Address(rsi,12), rdx); |
304 | |
305 | // |
306 | // Extended cpuid(0x80000007) |
307 | // |
308 | __ bind(ext_cpuid7); |
309 | __ movl(rax, 0x80000007); |
310 | __ cpuid(); |
311 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::ext_cpuid7_offset()))); |
312 | __ movl(Address(rsi, 0), rax); |
313 | __ movl(Address(rsi, 4), rbx); |
314 | __ movl(Address(rsi, 8), rcx); |
315 | __ movl(Address(rsi,12), rdx); |
316 | |
317 | // |
318 | // Extended cpuid(0x80000005) |
319 | // |
320 | __ bind(ext_cpuid5); |
321 | __ movl(rax, 0x80000005); |
322 | __ cpuid(); |
323 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::ext_cpuid5_offset()))); |
324 | __ movl(Address(rsi, 0), rax); |
325 | __ movl(Address(rsi, 4), rbx); |
326 | __ movl(Address(rsi, 8), rcx); |
327 | __ movl(Address(rsi,12), rdx); |
328 | |
329 | // |
330 | // Extended cpuid(0x80000001) |
331 | // |
332 | __ bind(ext_cpuid1); |
333 | __ movl(rax, 0x80000001); |
334 | __ cpuid(); |
335 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::ext_cpuid1_offset()))); |
336 | __ movl(Address(rsi, 0), rax); |
337 | __ movl(Address(rsi, 4), rbx); |
338 | __ movl(Address(rsi, 8), rcx); |
339 | __ movl(Address(rsi,12), rdx); |
340 | |
341 | // |
342 | // Check if OS has enabled XGETBV instruction to access XCR0 |
343 | // (OSXSAVE feature flag) and CPU supports AVX |
344 | // |
345 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset()))); |
346 | __ movl(rcx, 0x18000000); // cpuid1 bits osxsave | avx |
347 | __ andl(rcx, Address(rsi, 8)); // cpuid1 bits osxsave | avx |
348 | __ cmpl(rcx, 0x18000000); |
349 | __ jccb(Assembler::notEqual, done); // jump if AVX is not supported |
350 | |
351 | __ movl(rax, 0x6); |
352 | __ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits sse | ymm |
353 | __ cmpl(rax, 0x6); |
    __ jccb(Assembler::equal, start_simd_check); // jump if OS has enabled AVX state (XCR0 sse | ymm bits set)
355 | |
    // we need to branch farther than an imm8 displacement allows, so we use this island as a thunk
357 | __ bind(done); |
358 | __ jmp(wrapup); |
359 | |
360 | __ bind(start_simd_check); |
361 | // |
    // Some OSes have a bug where the upper 128/256 bits of the YMM/ZMM
    // registers are not restored after signal handling.
    // Generate a SEGV here (by reading through NULL)
    // and check the upper YMM/ZMM bits after it.
366 | // |
367 | intx saved_useavx = UseAVX; |
368 | intx saved_usesse = UseSSE; |
369 | // check _cpuid_info.sef_cpuid7_ebx.bits.avx512f |
370 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset()))); |
371 | __ movl(rax, 0x10000); |
    __ andl(rax, Address(rsi, 4)); // sef_cpuid7 ebx bit 16: avx512f
373 | __ cmpl(rax, 0x10000); |
374 | __ jccb(Assembler::notEqual, legacy_setup); // jump if EVEX is not supported |
375 | // check _cpuid_info.xem_xcr0_eax.bits.opmask |
376 | // check _cpuid_info.xem_xcr0_eax.bits.zmm512 |
377 | // check _cpuid_info.xem_xcr0_eax.bits.zmm32 |
378 | __ movl(rax, 0xE0); |
    __ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits opmask | zmm512 | zmm32
380 | __ cmpl(rax, 0xE0); |
381 | __ jccb(Assembler::notEqual, legacy_setup); // jump if EVEX is not supported |
382 | |
    // If UseAVX is uninitialized or is set by the user to include EVEX
384 | if (use_evex) { |
385 | // EVEX setup: run in lowest evex mode |
386 | VM_Version::set_evex_cpuFeatures(); // Enable temporary to pass asserts |
387 | UseAVX = 3; |
388 | UseSSE = 2; |
389 | #ifdef _WINDOWS |
      // xmm6-xmm15 are callee-saved on Windows, so save the registers we clobber
391 | // https://msdn.microsoft.com/en-us/library/9z1stfyw.aspx |
392 | __ subptr(rsp, 64); |
393 | __ evmovdqul(Address(rsp, 0), xmm7, Assembler::AVX_512bit); |
394 | #ifdef _LP64 |
395 | __ subptr(rsp, 64); |
396 | __ evmovdqul(Address(rsp, 0), xmm8, Assembler::AVX_512bit); |
397 | __ subptr(rsp, 64); |
398 | __ evmovdqul(Address(rsp, 0), xmm31, Assembler::AVX_512bit); |
399 | #endif // _LP64 |
400 | #endif // _WINDOWS |
401 | |
402 | // load value into all 64 bytes of zmm7 register |
403 | __ movl(rcx, VM_Version::ymm_test_value()); |
404 | __ movdl(xmm0, rcx); |
405 | __ vpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit); |
406 | __ evmovdqul(xmm7, xmm0, Assembler::AVX_512bit); |
407 | #ifdef _LP64 |
408 | __ evmovdqul(xmm8, xmm0, Assembler::AVX_512bit); |
409 | __ evmovdqul(xmm31, xmm0, Assembler::AVX_512bit); |
410 | #endif |
411 | VM_Version::clean_cpuFeatures(); |
412 | __ jmp(save_restore_except); |
413 | } |
414 | |
415 | __ bind(legacy_setup); |
416 | // AVX setup |
417 | VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts |
418 | UseAVX = 1; |
419 | UseSSE = 2; |
420 | #ifdef _WINDOWS |
421 | __ subptr(rsp, 32); |
422 | __ vmovdqu(Address(rsp, 0), xmm7); |
423 | #ifdef _LP64 |
424 | __ subptr(rsp, 32); |
425 | __ vmovdqu(Address(rsp, 0), xmm8); |
426 | __ subptr(rsp, 32); |
427 | __ vmovdqu(Address(rsp, 0), xmm15); |
428 | #endif // _LP64 |
429 | #endif // _WINDOWS |
430 | |
431 | // load value into all 32 bytes of ymm7 register |
432 | __ movl(rcx, VM_Version::ymm_test_value()); |
433 | |
434 | __ movdl(xmm0, rcx); |
435 | __ pshufd(xmm0, xmm0, 0x00); |
436 | __ vinsertf128_high(xmm0, xmm0); |
437 | __ vmovdqu(xmm7, xmm0); |
438 | #ifdef _LP64 |
439 | __ vmovdqu(xmm8, xmm0); |
440 | __ vmovdqu(xmm15, xmm0); |
441 | #endif |
442 | VM_Version::clean_cpuFeatures(); |
443 | |
444 | __ bind(save_restore_except); |
445 | __ xorl(rsi, rsi); |
446 | VM_Version::set_cpuinfo_segv_addr(__ pc()); |
447 | // Generate SEGV |
448 | __ movl(rax, Address(rsi, 0)); |
449 | |
450 | VM_Version::set_cpuinfo_cont_addr(__ pc()); |
451 | // Returns here after signal. Save xmm0 to check it later. |
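    // The platform-specific signal handler recognizes _cpuinfo_segv_addr as the
    // faulting instruction and resumes execution at _cpuinfo_cont_addr.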
452 | |
453 | // check _cpuid_info.sef_cpuid7_ebx.bits.avx512f |
454 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset()))); |
455 | __ movl(rax, 0x10000); |
456 | __ andl(rax, Address(rsi, 4)); |
457 | __ cmpl(rax, 0x10000); |
458 | __ jcc(Assembler::notEqual, legacy_save_restore); |
459 | // check _cpuid_info.xem_xcr0_eax.bits.opmask |
460 | // check _cpuid_info.xem_xcr0_eax.bits.zmm512 |
461 | // check _cpuid_info.xem_xcr0_eax.bits.zmm32 |
462 | __ movl(rax, 0xE0); |
    __ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits opmask | zmm512 | zmm32
464 | __ cmpl(rax, 0xE0); |
465 | __ jcc(Assembler::notEqual, legacy_save_restore); |
466 | |
    // If UseAVX is uninitialized or is set by the user to include EVEX
468 | if (use_evex) { |
469 | // EVEX check: run in lowest evex mode |
470 | VM_Version::set_evex_cpuFeatures(); // Enable temporary to pass asserts |
471 | UseAVX = 3; |
472 | UseSSE = 2; |
473 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::zmm_save_offset()))); |
474 | __ evmovdqul(Address(rsi, 0), xmm0, Assembler::AVX_512bit); |
475 | __ evmovdqul(Address(rsi, 64), xmm7, Assembler::AVX_512bit); |
476 | #ifdef _LP64 |
477 | __ evmovdqul(Address(rsi, 128), xmm8, Assembler::AVX_512bit); |
478 | __ evmovdqul(Address(rsi, 192), xmm31, Assembler::AVX_512bit); |
479 | #endif |
480 | |
481 | #ifdef _WINDOWS |
482 | #ifdef _LP64 |
483 | __ evmovdqul(xmm31, Address(rsp, 0), Assembler::AVX_512bit); |
484 | __ addptr(rsp, 64); |
485 | __ evmovdqul(xmm8, Address(rsp, 0), Assembler::AVX_512bit); |
486 | __ addptr(rsp, 64); |
487 | #endif // _LP64 |
488 | __ evmovdqul(xmm7, Address(rsp, 0), Assembler::AVX_512bit); |
489 | __ addptr(rsp, 64); |
490 | #endif // _WINDOWS |
491 | generate_vzeroupper(wrapup); |
492 | VM_Version::clean_cpuFeatures(); |
493 | UseAVX = saved_useavx; |
494 | UseSSE = saved_usesse; |
495 | __ jmp(wrapup); |
496 | } |
497 | |
498 | __ bind(legacy_save_restore); |
499 | // AVX check |
500 | VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts |
501 | UseAVX = 1; |
502 | UseSSE = 2; |
503 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::ymm_save_offset()))); |
504 | __ vmovdqu(Address(rsi, 0), xmm0); |
505 | __ vmovdqu(Address(rsi, 32), xmm7); |
506 | #ifdef _LP64 |
507 | __ vmovdqu(Address(rsi, 64), xmm8); |
508 | __ vmovdqu(Address(rsi, 96), xmm15); |
509 | #endif |
510 | |
511 | #ifdef _WINDOWS |
512 | #ifdef _LP64 |
513 | __ vmovdqu(xmm15, Address(rsp, 0)); |
514 | __ addptr(rsp, 32); |
515 | __ vmovdqu(xmm8, Address(rsp, 0)); |
516 | __ addptr(rsp, 32); |
517 | #endif // _LP64 |
518 | __ vmovdqu(xmm7, Address(rsp, 0)); |
519 | __ addptr(rsp, 32); |
520 | #endif // _WINDOWS |
521 | generate_vzeroupper(wrapup); |
522 | VM_Version::clean_cpuFeatures(); |
523 | UseAVX = saved_useavx; |
524 | UseSSE = saved_usesse; |
525 | |
526 | __ bind(wrapup); |
527 | __ popf(); |
528 | __ pop(rsi); |
529 | __ pop(rbx); |
530 | __ pop(rbp); |
531 | __ ret(0); |
532 | |
533 | # undef __ |
534 | |
535 | return start; |
536 | }; |
537 | void generate_vzeroupper(Label& L_wrapup) { |
538 | # define __ _masm-> |
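    // Emit vzeroupper only on Intel CPUs, and skip it on Knights family
    // (Xeon Phi) parts where the instruction is known to be slow.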
539 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid0_offset()))); |
540 | __ cmpl(Address(rsi, 4), 0x756e6547); // 'uneG' |
541 | __ jcc(Assembler::notEqual, L_wrapup); |
542 | __ movl(rcx, 0x0FFF0FF0); |
543 | __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset()))); |
544 | __ andl(rcx, Address(rsi, 0)); |
545 | __ cmpl(rcx, 0x00050670); // If it is Xeon Phi 3200/5200/7200 |
546 | __ jcc(Assembler::equal, L_wrapup); |
547 | __ cmpl(rcx, 0x00080650); // If it is Future Xeon Phi |
548 | __ jcc(Assembler::equal, L_wrapup); |
549 | __ vzeroupper(); |
550 | # undef __ |
551 | } |
552 | }; |
553 | |
554 | void VM_Version::get_processor_features() { |
555 | |
556 | _cpu = 4; // 486 by default |
557 | _model = 0; |
558 | _stepping = 0; |
559 | _features = 0; |
560 | _logical_processors_per_package = 1; |
561 | // i486 internal cache is both I&D and has a 16-byte line size |
562 | _L1_data_cache_line_size = 16; |
563 | |
564 | // Get raw processor info |
565 | |
566 | get_cpu_info_stub(&_cpuid_info); |
567 | |
568 | assert_is_initialized(); |
569 | _cpu = extended_cpu_family(); |
570 | _model = extended_cpu_model(); |
571 | _stepping = cpu_stepping(); |
572 | |
573 | if (cpu_family() > 4) { // it supports CPUID |
574 | _features = feature_flags(); |
575 | // Logical processors are only available on P4s and above, |
576 | // and only if hyperthreading is available. |
577 | _logical_processors_per_package = logical_processor_count(); |
578 | _L1_data_cache_line_size = L1_line_size(); |
579 | } |
580 | |
581 | _supports_cx8 = supports_cmpxchg8(); |
582 | // xchg and xadd instructions |
583 | _supports_atomic_getset4 = true; |
584 | _supports_atomic_getadd4 = true; |
585 | LP64_ONLY(_supports_atomic_getset8 = true); |
586 | LP64_ONLY(_supports_atomic_getadd8 = true); |
587 | |
588 | #ifdef _LP64 |
589 | // OS should support SSE for x64 and hardware should support at least SSE2. |
590 | if (!VM_Version::supports_sse2()) { |
591 | vm_exit_during_initialization("Unknown x64 processor: SSE2 not supported" ); |
592 | } |
593 | // in 64 bit the use of SSE2 is the minimum |
594 | if (UseSSE < 2) UseSSE = 2; |
595 | #endif |
596 | |
597 | #ifdef AMD64 |
  // flush_icache_stub has to be generated first.
  // That is why the Icache line size is hard coded in the ICache class,
  // see icache_x86.hpp. It is also the reason why we can't use the
  // clflush instruction in the 32-bit VM, since it could be running
  // on a CPU which does not support it.
  //
  // The only thing we can do is verify that the ICache::line_size used
  // for flushing has the correct value.
606 | guarantee(_cpuid_info.std_cpuid1_edx.bits.clflush != 0, "clflush is not supported" ); |
607 | // clflush_size is size in quadwords (8 bytes). |
608 | guarantee(_cpuid_info.std_cpuid1_ebx.bits.clflush_size == 8, "such clflush size is not supported" ); |
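  // A clflush_size of 8 quadwords corresponds to a 64-byte cache line.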
609 | #endif |
610 | |
611 | // If the OS doesn't support SSE, we can't use this feature even if the HW does |
612 | if (!os::supports_sse()) |
613 | _features &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4A|CPU_SSE4_1|CPU_SSE4_2); |
614 | |
615 | if (UseSSE < 4) { |
616 | _features &= ~CPU_SSE4_1; |
617 | _features &= ~CPU_SSE4_2; |
618 | } |
619 | |
620 | if (UseSSE < 3) { |
621 | _features &= ~CPU_SSE3; |
622 | _features &= ~CPU_SSSE3; |
623 | _features &= ~CPU_SSE4A; |
624 | } |
625 | |
626 | if (UseSSE < 2) |
627 | _features &= ~CPU_SSE2; |
628 | |
629 | if (UseSSE < 1) |
630 | _features &= ~CPU_SSE; |
631 | |
  // Since AVX instructions are slower than SSE on some ZX CPUs, force UseAVX=0.
633 | if (is_zx() && ((cpu_family() == 6) || (cpu_family() == 7))) { |
634 | UseAVX = 0; |
635 | } |
636 | |
637 | // first try initial setting and detect what we can support |
638 | int use_avx_limit = 0; |
639 | if (UseAVX > 0) { |
640 | if (UseAVX > 2 && supports_evex()) { |
641 | use_avx_limit = 3; |
642 | } else if (UseAVX > 1 && supports_avx2()) { |
643 | use_avx_limit = 2; |
644 | } else if (UseAVX > 0 && supports_avx()) { |
645 | use_avx_limit = 1; |
646 | } else { |
647 | use_avx_limit = 0; |
648 | } |
649 | } |
650 | if (FLAG_IS_DEFAULT(UseAVX)) { |
651 | FLAG_SET_DEFAULT(UseAVX, use_avx_limit); |
652 | } else if (UseAVX > use_avx_limit) { |
653 | warning("UseAVX=%d is not supported on this CPU, setting it to UseAVX=%d" , (int) UseAVX, use_avx_limit); |
654 | FLAG_SET_DEFAULT(UseAVX, use_avx_limit); |
655 | } else if (UseAVX < 0) { |
656 | warning("UseAVX=%d is not valid, setting it to UseAVX=0" , (int) UseAVX); |
657 | FLAG_SET_DEFAULT(UseAVX, 0); |
658 | } |
659 | |
660 | if (UseAVX < 3) { |
661 | _features &= ~CPU_AVX512F; |
662 | _features &= ~CPU_AVX512DQ; |
663 | _features &= ~CPU_AVX512CD; |
664 | _features &= ~CPU_AVX512BW; |
665 | _features &= ~CPU_AVX512VL; |
666 | _features &= ~CPU_AVX512_VPOPCNTDQ; |
667 | _features &= ~CPU_VPCLMULQDQ; |
668 | _features &= ~CPU_VAES; |
669 | } |
670 | |
671 | if (UseAVX < 2) |
672 | _features &= ~CPU_AVX2; |
673 | |
674 | if (UseAVX < 1) { |
675 | _features &= ~CPU_AVX; |
676 | _features &= ~CPU_VZEROUPPER; |
677 | } |
678 | |
679 | if (logical_processors_per_package() == 1) { |
    // An HT-capable processor could be installed on a system which doesn't support HT.
681 | _features &= ~CPU_HT; |
682 | } |
683 | |
684 | if (is_intel()) { // Intel cpus specific settings |
685 | if (is_knights_family()) { |
686 | _features &= ~CPU_VZEROUPPER; |
687 | } |
688 | } |
689 | |
690 | char buf[256]; |
691 | jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s" , |
692 | cores_per_cpu(), threads_per_core(), |
693 | cpu_family(), _model, _stepping, |
694 | (supports_cmov() ? ", cmov" : "" ), |
695 | (supports_cmpxchg8() ? ", cx8" : "" ), |
696 | (supports_fxsr() ? ", fxsr" : "" ), |
697 | (supports_mmx() ? ", mmx" : "" ), |
698 | (supports_sse() ? ", sse" : "" ), |
699 | (supports_sse2() ? ", sse2" : "" ), |
700 | (supports_sse3() ? ", sse3" : "" ), |
701 | (supports_ssse3()? ", ssse3" : "" ), |
702 | (supports_sse4_1() ? ", sse4.1" : "" ), |
703 | (supports_sse4_2() ? ", sse4.2" : "" ), |
704 | (supports_popcnt() ? ", popcnt" : "" ), |
705 | (supports_avx() ? ", avx" : "" ), |
706 | (supports_avx2() ? ", avx2" : "" ), |
707 | (supports_aes() ? ", aes" : "" ), |
708 | (supports_clmul() ? ", clmul" : "" ), |
709 | (supports_erms() ? ", erms" : "" ), |
710 | (supports_rtm() ? ", rtm" : "" ), |
711 | (supports_mmx_ext() ? ", mmxext" : "" ), |
712 | (supports_3dnow_prefetch() ? ", 3dnowpref" : "" ), |
713 | (supports_lzcnt() ? ", lzcnt" : "" ), |
714 | (supports_sse4a() ? ", sse4a" : "" ), |
715 | (supports_ht() ? ", ht" : "" ), |
716 | (supports_tsc() ? ", tsc" : "" ), |
717 | (supports_tscinv_bit() ? ", tscinvbit" : "" ), |
718 | (supports_tscinv() ? ", tscinv" : "" ), |
719 | (supports_bmi1() ? ", bmi1" : "" ), |
720 | (supports_bmi2() ? ", bmi2" : "" ), |
721 | (supports_adx() ? ", adx" : "" ), |
722 | (supports_evex() ? ", evex" : "" ), |
723 | (supports_sha() ? ", sha" : "" ), |
724 | (supports_fma() ? ", fma" : "" )); |
725 | _features_string = os::strdup(buf); |
726 | |
727 | // UseSSE is set to the smaller of what hardware supports and what |
728 | // the command line requires. I.e., you cannot set UseSSE to 2 on |
729 | // older Pentiums which do not support it. |
730 | int use_sse_limit = 0; |
731 | if (UseSSE > 0) { |
732 | if (UseSSE > 3 && supports_sse4_1()) { |
733 | use_sse_limit = 4; |
734 | } else if (UseSSE > 2 && supports_sse3()) { |
735 | use_sse_limit = 3; |
736 | } else if (UseSSE > 1 && supports_sse2()) { |
737 | use_sse_limit = 2; |
738 | } else if (UseSSE > 0 && supports_sse()) { |
739 | use_sse_limit = 1; |
740 | } else { |
741 | use_sse_limit = 0; |
742 | } |
743 | } |
744 | if (FLAG_IS_DEFAULT(UseSSE)) { |
745 | FLAG_SET_DEFAULT(UseSSE, use_sse_limit); |
746 | } else if (UseSSE > use_sse_limit) { |
747 | warning("UseSSE=%d is not supported on this CPU, setting it to UseSSE=%d" , (int) UseSSE, use_sse_limit); |
748 | FLAG_SET_DEFAULT(UseSSE, use_sse_limit); |
749 | } else if (UseSSE < 0) { |
750 | warning("UseSSE=%d is not valid, setting it to UseSSE=0" , (int) UseSSE); |
751 | FLAG_SET_DEFAULT(UseSSE, 0); |
752 | } |
753 | |
754 | // Use AES instructions if available. |
755 | if (supports_aes()) { |
756 | if (FLAG_IS_DEFAULT(UseAES)) { |
757 | FLAG_SET_DEFAULT(UseAES, true); |
758 | } |
759 | if (!UseAES) { |
760 | if (UseAESIntrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) { |
761 | warning("AES intrinsics require UseAES flag to be enabled. Intrinsics will be disabled." ); |
762 | } |
763 | FLAG_SET_DEFAULT(UseAESIntrinsics, false); |
764 | } else { |
765 | if (UseSSE > 2) { |
766 | if (FLAG_IS_DEFAULT(UseAESIntrinsics)) { |
767 | FLAG_SET_DEFAULT(UseAESIntrinsics, true); |
768 | } |
769 | } else { |
770 | // The AES intrinsic stubs require AES instruction support (of course) |
        // but also require SSE3 mode or higher for the instructions they use.
772 | if (UseAESIntrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) { |
773 | warning("X86 AES intrinsics require SSE3 instructions or higher. Intrinsics will be disabled." ); |
774 | } |
775 | FLAG_SET_DEFAULT(UseAESIntrinsics, false); |
776 | } |
777 | |
778 | // --AES-CTR begins-- |
779 | if (!UseAESIntrinsics) { |
780 | if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { |
781 | warning("AES-CTR intrinsics require UseAESIntrinsics flag to be enabled. Intrinsics will be disabled." ); |
782 | FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); |
783 | } |
784 | } else { |
785 | if (supports_sse4_1()) { |
786 | if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { |
787 | FLAG_SET_DEFAULT(UseAESCTRIntrinsics, true); |
788 | } |
789 | } else { |
790 | // The AES-CTR intrinsic stubs require AES instruction support (of course) |
          // but also require SSE4.1 mode or higher for the instructions they use.
792 | if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { |
793 | warning("X86 AES-CTR intrinsics require SSE4.1 instructions or higher. Intrinsics will be disabled." ); |
794 | } |
795 | FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); |
796 | } |
797 | } |
798 | // --AES-CTR ends-- |
799 | } |
800 | } else if (UseAES || UseAESIntrinsics || UseAESCTRIntrinsics) { |
801 | if (UseAES && !FLAG_IS_DEFAULT(UseAES)) { |
802 | warning("AES instructions are not available on this CPU" ); |
803 | FLAG_SET_DEFAULT(UseAES, false); |
804 | } |
805 | if (UseAESIntrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) { |
806 | warning("AES intrinsics are not available on this CPU" ); |
807 | FLAG_SET_DEFAULT(UseAESIntrinsics, false); |
808 | } |
809 | if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { |
810 | warning("AES-CTR intrinsics are not available on this CPU" ); |
811 | FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); |
812 | } |
813 | } |
814 | |
815 | // Use CLMUL instructions if available. |
816 | if (supports_clmul()) { |
817 | if (FLAG_IS_DEFAULT(UseCLMUL)) { |
818 | UseCLMUL = true; |
819 | } |
820 | } else if (UseCLMUL) { |
821 | if (!FLAG_IS_DEFAULT(UseCLMUL)) |
822 | warning("CLMUL instructions not available on this CPU (AVX may also be required)" ); |
823 | FLAG_SET_DEFAULT(UseCLMUL, false); |
824 | } |
825 | |
826 | if (UseCLMUL && (UseSSE > 2)) { |
827 | if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) { |
828 | UseCRC32Intrinsics = true; |
829 | } |
830 | } else if (UseCRC32Intrinsics) { |
831 | if (!FLAG_IS_DEFAULT(UseCRC32Intrinsics)) |
832 | warning("CRC32 Intrinsics requires CLMUL instructions (not available on this CPU)" ); |
833 | FLAG_SET_DEFAULT(UseCRC32Intrinsics, false); |
834 | } |
835 | |
836 | if (supports_sse4_2() && supports_clmul()) { |
837 | if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) { |
838 | UseCRC32CIntrinsics = true; |
839 | } |
840 | } else if (UseCRC32CIntrinsics) { |
841 | if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) { |
842 | warning("CRC32C intrinsics are not available on this CPU" ); |
843 | } |
844 | FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false); |
845 | } |
846 | |
847 | // GHASH/GCM intrinsics |
848 | if (UseCLMUL && (UseSSE > 2)) { |
849 | if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { |
850 | UseGHASHIntrinsics = true; |
851 | } |
852 | } else if (UseGHASHIntrinsics) { |
853 | if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics)) |
854 | warning("GHASH intrinsic requires CLMUL and SSE2 instructions on this CPU" ); |
855 | FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); |
856 | } |
857 | |
  // Base64 intrinsics (check the conditions under which the intrinsic will be enabled)
859 | if ((UseAVX > 2) && supports_avx512vl() && supports_avx512bw()) { |
860 | if (FLAG_IS_DEFAULT(UseBASE64Intrinsics)) { |
861 | UseBASE64Intrinsics = true; |
862 | } |
863 | } else if (UseBASE64Intrinsics) { |
864 | if (!FLAG_IS_DEFAULT(UseBASE64Intrinsics)) |
865 | warning("Base64 intrinsic requires EVEX instructions on this CPU" ); |
866 | FLAG_SET_DEFAULT(UseBASE64Intrinsics, false); |
867 | } |
868 | |
869 | if (supports_fma() && UseSSE >= 2) { // Check UseSSE since FMA code uses SSE instructions |
870 | if (FLAG_IS_DEFAULT(UseFMA)) { |
871 | UseFMA = true; |
872 | } |
873 | } else if (UseFMA) { |
874 | warning("FMA instructions are not available on this CPU" ); |
875 | FLAG_SET_DEFAULT(UseFMA, false); |
876 | } |
877 | |
878 | if (supports_sha() LP64_ONLY(|| supports_avx2() && supports_bmi2())) { |
879 | if (FLAG_IS_DEFAULT(UseSHA)) { |
880 | UseSHA = true; |
881 | } |
882 | } else if (UseSHA) { |
883 | warning("SHA instructions are not available on this CPU" ); |
884 | FLAG_SET_DEFAULT(UseSHA, false); |
885 | } |
886 | |
887 | if (supports_sha() && supports_sse4_1() && UseSHA) { |
888 | if (FLAG_IS_DEFAULT(UseSHA1Intrinsics)) { |
889 | FLAG_SET_DEFAULT(UseSHA1Intrinsics, true); |
890 | } |
891 | } else if (UseSHA1Intrinsics) { |
892 | warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU." ); |
893 | FLAG_SET_DEFAULT(UseSHA1Intrinsics, false); |
894 | } |
895 | |
896 | if (supports_sse4_1() && UseSHA) { |
897 | if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) { |
898 | FLAG_SET_DEFAULT(UseSHA256Intrinsics, true); |
899 | } |
900 | } else if (UseSHA256Intrinsics) { |
901 | warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU." ); |
902 | FLAG_SET_DEFAULT(UseSHA256Intrinsics, false); |
903 | } |
904 | |
905 | #ifdef _LP64 |
906 | // These are only supported on 64-bit |
907 | if (UseSHA && supports_avx2() && supports_bmi2()) { |
908 | if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) { |
909 | FLAG_SET_DEFAULT(UseSHA512Intrinsics, true); |
910 | } |
911 | } else |
912 | #endif |
913 | if (UseSHA512Intrinsics) { |
914 | warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU." ); |
915 | FLAG_SET_DEFAULT(UseSHA512Intrinsics, false); |
916 | } |
917 | |
918 | if (!(UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics)) { |
919 | FLAG_SET_DEFAULT(UseSHA, false); |
920 | } |
921 | |
922 | if (UseAdler32Intrinsics) { |
923 | warning("Adler32Intrinsics not available on this CPU." ); |
924 | FLAG_SET_DEFAULT(UseAdler32Intrinsics, false); |
925 | } |
926 | |
927 | if (!supports_rtm() && UseRTMLocking) { |
928 | // Can't continue because UseRTMLocking affects UseBiasedLocking flag |
929 | // setting during arguments processing. See use_biased_locking(). |
930 | // VM_Version_init() is executed after UseBiasedLocking is used |
931 | // in Thread::allocate(). |
932 | vm_exit_during_initialization("RTM instructions are not available on this CPU" ); |
933 | } |
934 | |
935 | #if INCLUDE_RTM_OPT |
936 | if (UseRTMLocking) { |
937 | if (is_client_compilation_mode_vm()) { |
938 | // Only C2 does RTM locking optimization. |
939 | // Can't continue because UseRTMLocking affects UseBiasedLocking flag |
940 | // setting during arguments processing. See use_biased_locking(). |
941 | vm_exit_during_initialization("RTM locking optimization is not supported in this VM" ); |
942 | } |
943 | if (is_intel_family_core()) { |
944 | if ((_model == CPU_MODEL_HASWELL_E3) || |
945 | (_model == CPU_MODEL_HASWELL_E7 && _stepping < 3) || |
946 | (_model == CPU_MODEL_BROADWELL && _stepping < 4)) { |
947 | // currently a collision between SKL and HSW_E3 |
948 | if (!UnlockExperimentalVMOptions && UseAVX < 3) { |
949 | vm_exit_during_initialization("UseRTMLocking is only available as experimental option on this " |
950 | "platform. It must be enabled via -XX:+UnlockExperimentalVMOptions flag." ); |
951 | } else { |
952 | warning("UseRTMLocking is only available as experimental option on this platform." ); |
953 | } |
954 | } |
955 | } |
956 | if (!FLAG_IS_CMDLINE(UseRTMLocking)) { |
957 | // RTM locking should be used only for applications with |
958 | // high lock contention. For now we do not use it by default. |
959 | vm_exit_during_initialization("UseRTMLocking flag should be only set on command line" ); |
960 | } |
961 | } else { // !UseRTMLocking |
962 | if (UseRTMForStackLocks) { |
963 | if (!FLAG_IS_DEFAULT(UseRTMForStackLocks)) { |
964 | warning("UseRTMForStackLocks flag should be off when UseRTMLocking flag is off" ); |
965 | } |
966 | FLAG_SET_DEFAULT(UseRTMForStackLocks, false); |
967 | } |
968 | if (UseRTMDeopt) { |
969 | FLAG_SET_DEFAULT(UseRTMDeopt, false); |
970 | } |
971 | if (PrintPreciseRTMLockingStatistics) { |
972 | FLAG_SET_DEFAULT(PrintPreciseRTMLockingStatistics, false); |
973 | } |
974 | } |
975 | #else |
976 | if (UseRTMLocking) { |
977 | // Only C2 does RTM locking optimization. |
978 | // Can't continue because UseRTMLocking affects UseBiasedLocking flag |
979 | // setting during arguments processing. See use_biased_locking(). |
980 | vm_exit_during_initialization("RTM locking optimization is not supported in this VM" ); |
981 | } |
982 | #endif |
983 | |
984 | #ifdef COMPILER2 |
985 | if (UseFPUForSpilling) { |
986 | if (UseSSE < 2) { |
987 | // Only supported with SSE2+ |
988 | FLAG_SET_DEFAULT(UseFPUForSpilling, false); |
989 | } |
990 | } |
991 | #endif |
992 | |
993 | #if COMPILER2_OR_JVMCI |
994 | int max_vector_size = 0; |
995 | if (UseSSE < 2) { |
996 | // Vectors (in XMM) are only supported with SSE2+ |
997 | // SSE is always 2 on x64. |
998 | max_vector_size = 0; |
999 | } else if (UseAVX == 0 || !os_supports_avx_vectors()) { |
1000 | // 16 byte vectors (in XMM) are supported with SSE2+ |
1001 | max_vector_size = 16; |
1002 | } else if (UseAVX == 1 || UseAVX == 2) { |
    // 32-byte vectors (in YMM) are only supported with AVX+
1004 | max_vector_size = 32; |
1005 | } else if (UseAVX > 2) { |
    // 64-byte vectors (in ZMM) are only supported with AVX3
1007 | max_vector_size = 64; |
1008 | } |
1009 | |
1010 | #ifdef _LP64 |
1011 | int min_vector_size = 4; // We require MaxVectorSize to be at least 4 on 64bit |
1012 | #else |
1013 | int min_vector_size = 0; |
1014 | #endif |
1015 | |
1016 | if (!FLAG_IS_DEFAULT(MaxVectorSize)) { |
1017 | if (MaxVectorSize < min_vector_size) { |
1018 | warning("MaxVectorSize must be at least %i on this platform" , min_vector_size); |
1019 | FLAG_SET_DEFAULT(MaxVectorSize, min_vector_size); |
1020 | } |
1021 | if (MaxVectorSize > max_vector_size) { |
1022 | warning("MaxVectorSize must be at most %i on this platform" , max_vector_size); |
1023 | FLAG_SET_DEFAULT(MaxVectorSize, max_vector_size); |
1024 | } |
1025 | if (!is_power_of_2(MaxVectorSize)) { |
1026 | warning("MaxVectorSize must be a power of 2, setting to default: %i" , max_vector_size); |
1027 | FLAG_SET_DEFAULT(MaxVectorSize, max_vector_size); |
1028 | } |
1029 | } else { |
1030 | // If default, use highest supported configuration |
1031 | FLAG_SET_DEFAULT(MaxVectorSize, max_vector_size); |
1032 | } |
1033 | |
1034 | #if defined(COMPILER2) && defined(ASSERT) |
1035 | if (MaxVectorSize > 0) { |
1036 | if (supports_avx() && PrintMiscellaneous && Verbose && TraceNewVectors) { |
      tty->print_cr("State of YMM registers after signal handling:");
1038 | int nreg = 2 LP64_ONLY(+2); |
1039 | const char* ymm_name[4] = {"0" , "7" , "8" , "15" }; |
1040 | for (int i = 0; i < nreg; i++) { |
1041 | tty->print("YMM%s:" , ymm_name[i]); |
1042 | for (int j = 7; j >=0; j--) { |
1043 | tty->print(" %x" , _cpuid_info.ymm_save[i*8 + j]); |
1044 | } |
1045 | tty->cr(); |
1046 | } |
1047 | } |
1048 | } |
1049 | #endif // COMPILER2 && ASSERT |
1050 | |
1051 | #ifdef _LP64 |
1052 | if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) { |
1053 | UseMultiplyToLenIntrinsic = true; |
1054 | } |
1055 | if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) { |
1056 | UseSquareToLenIntrinsic = true; |
1057 | } |
1058 | if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) { |
1059 | UseMulAddIntrinsic = true; |
1060 | } |
1061 | if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) { |
1062 | UseMontgomeryMultiplyIntrinsic = true; |
1063 | } |
1064 | if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) { |
1065 | UseMontgomerySquareIntrinsic = true; |
1066 | } |
1067 | #else |
1068 | if (UseMultiplyToLenIntrinsic) { |
1069 | if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) { |
1070 | warning("multiplyToLen intrinsic is not available in 32-bit VM" ); |
1071 | } |
1072 | FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false); |
1073 | } |
1074 | if (UseMontgomeryMultiplyIntrinsic) { |
1075 | if (!FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) { |
1076 | warning("montgomeryMultiply intrinsic is not available in 32-bit VM" ); |
1077 | } |
1078 | FLAG_SET_DEFAULT(UseMontgomeryMultiplyIntrinsic, false); |
1079 | } |
1080 | if (UseMontgomerySquareIntrinsic) { |
1081 | if (!FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) { |
1082 | warning("montgomerySquare intrinsic is not available in 32-bit VM" ); |
1083 | } |
1084 | FLAG_SET_DEFAULT(UseMontgomerySquareIntrinsic, false); |
1085 | } |
1086 | if (UseSquareToLenIntrinsic) { |
1087 | if (!FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) { |
1088 | warning("squareToLen intrinsic is not available in 32-bit VM" ); |
1089 | } |
1090 | FLAG_SET_DEFAULT(UseSquareToLenIntrinsic, false); |
1091 | } |
1092 | if (UseMulAddIntrinsic) { |
1093 | if (!FLAG_IS_DEFAULT(UseMulAddIntrinsic)) { |
1094 | warning("mulAdd intrinsic is not available in 32-bit VM" ); |
1095 | } |
1096 | FLAG_SET_DEFAULT(UseMulAddIntrinsic, false); |
1097 | } |
1098 | #endif // _LP64 |
1099 | #endif // COMPILER2_OR_JVMCI |
1100 | |
  // On new CPUs, instructions which update the whole XMM register should be used
  // to prevent partial register stalls due to dependencies on the high half.
1103 | // |
1104 | // UseXmmLoadAndClearUpper == true --> movsd(xmm, mem) |
1105 | // UseXmmLoadAndClearUpper == false --> movlpd(xmm, mem) |
1106 | // UseXmmRegToRegMoveAll == true --> movaps(xmm, xmm), movapd(xmm, xmm). |
1107 | // UseXmmRegToRegMoveAll == false --> movss(xmm, xmm), movsd(xmm, xmm). |
1108 | |
1109 | |
1110 | if (is_zx()) { // ZX cpus specific settings |
1111 | if (FLAG_IS_DEFAULT(UseStoreImmI16)) { |
1112 | UseStoreImmI16 = false; // don't use it on ZX cpus |
1113 | } |
1114 | if ((cpu_family() == 6) || (cpu_family() == 7)) { |
1115 | if (FLAG_IS_DEFAULT(UseAddressNop)) { |
1116 | // Use it on all ZX cpus |
1117 | UseAddressNop = true; |
1118 | } |
1119 | } |
1120 | if (FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper)) { |
1121 | UseXmmLoadAndClearUpper = true; // use movsd on all ZX cpus |
1122 | } |
1123 | if (FLAG_IS_DEFAULT(UseXmmRegToRegMoveAll)) { |
1124 | if (supports_sse3()) { |
1125 | UseXmmRegToRegMoveAll = true; // use movaps, movapd on new ZX cpus |
1126 | } else { |
1127 | UseXmmRegToRegMoveAll = false; |
1128 | } |
1129 | } |
1130 | if (((cpu_family() == 6) || (cpu_family() == 7)) && supports_sse3()) { // new ZX cpus |
1131 | #ifdef COMPILER2 |
1132 | if (FLAG_IS_DEFAULT(MaxLoopPad)) { |
1133 | // For new ZX cpus do the next optimization: |
1134 | // don't align the beginning of a loop if there are enough instructions |
1135 | // left (NumberOfLoopInstrToAlign defined in c2_globals.hpp) |
1136 | // in current fetch line (OptoLoopAlignment) or the padding |
1137 | // is big (> MaxLoopPad). |
1138 | // Set MaxLoopPad to 11 for new ZX cpus to reduce number of |
1139 | // generated NOP instructions. 11 is the largest size of one |
1140 | // address NOP instruction '0F 1F' (see Assembler::nop(i)). |
1141 | MaxLoopPad = 11; |
1142 | } |
1143 | #endif // COMPILER2 |
1144 | if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) { |
1145 | UseXMMForArrayCopy = true; // use SSE2 movq on new ZX cpus |
1146 | } |
1147 | if (supports_sse4_2()) { // new ZX cpus |
1148 | if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) { |
1149 | UseUnalignedLoadStores = true; // use movdqu on newest ZX cpus |
1150 | } |
1151 | } |
1152 | if (supports_sse4_2()) { |
1153 | if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) { |
1154 | FLAG_SET_DEFAULT(UseSSE42Intrinsics, true); |
1155 | } |
1156 | } else { |
        if (UseSSE42Intrinsics && !FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
1158 | warning("SSE4.2 intrinsics require SSE4.2 instructions or higher. Intrinsics will be disabled." ); |
1159 | } |
1160 | FLAG_SET_DEFAULT(UseSSE42Intrinsics, false); |
1161 | } |
1162 | } |
1163 | |
1164 | if (FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) { |
1165 | FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3); |
1166 | } |
1167 | } |
1168 | |
1169 | if (is_amd_family()) { // AMD cpus specific settings |
1170 | if (supports_sse2() && FLAG_IS_DEFAULT(UseAddressNop)) { |
1171 | // Use it on new AMD cpus starting from Opteron. |
1172 | UseAddressNop = true; |
1173 | } |
1174 | if (supports_sse2() && FLAG_IS_DEFAULT(UseNewLongLShift)) { |
1175 | // Use it on new AMD cpus starting from Opteron. |
1176 | UseNewLongLShift = true; |
1177 | } |
1178 | if (FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper)) { |
1179 | if (supports_sse4a()) { |
1180 | UseXmmLoadAndClearUpper = true; // use movsd only on '10h' Opteron |
1181 | } else { |
1182 | UseXmmLoadAndClearUpper = false; |
1183 | } |
1184 | } |
1185 | if (FLAG_IS_DEFAULT(UseXmmRegToRegMoveAll)) { |
1186 | if (supports_sse4a()) { |
1187 | UseXmmRegToRegMoveAll = true; // use movaps, movapd only on '10h' |
1188 | } else { |
1189 | UseXmmRegToRegMoveAll = false; |
1190 | } |
1191 | } |
1192 | if (FLAG_IS_DEFAULT(UseXmmI2F)) { |
1193 | if (supports_sse4a()) { |
1194 | UseXmmI2F = true; |
1195 | } else { |
1196 | UseXmmI2F = false; |
1197 | } |
1198 | } |
1199 | if (FLAG_IS_DEFAULT(UseXmmI2D)) { |
1200 | if (supports_sse4a()) { |
1201 | UseXmmI2D = true; |
1202 | } else { |
1203 | UseXmmI2D = false; |
1204 | } |
1205 | } |
1206 | if (supports_sse4_2()) { |
1207 | if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) { |
1208 | FLAG_SET_DEFAULT(UseSSE42Intrinsics, true); |
1209 | } |
1210 | } else { |
      if (UseSSE42Intrinsics && !FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
1212 | warning("SSE4.2 intrinsics require SSE4.2 instructions or higher. Intrinsics will be disabled." ); |
1213 | } |
1214 | FLAG_SET_DEFAULT(UseSSE42Intrinsics, false); |
1215 | } |
1216 | |
1217 | // some defaults for AMD family 15h |
1218 | if (cpu_family() == 0x15) { |
1219 | // On family 15h processors default is no sw prefetch |
1220 | if (FLAG_IS_DEFAULT(AllocatePrefetchStyle)) { |
1221 | FLAG_SET_DEFAULT(AllocatePrefetchStyle, 0); |
1222 | } |
1223 | // Also, if some other prefetch style is specified, default instruction type is PREFETCHW |
1224 | if (FLAG_IS_DEFAULT(AllocatePrefetchInstr)) { |
1225 | FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3); |
1226 | } |
1227 | // On family 15h processors use XMM and UnalignedLoadStores for Array Copy |
1228 | if (supports_sse2() && FLAG_IS_DEFAULT(UseXMMForArrayCopy)) { |
1229 | FLAG_SET_DEFAULT(UseXMMForArrayCopy, true); |
1230 | } |
1231 | if (supports_sse2() && FLAG_IS_DEFAULT(UseUnalignedLoadStores)) { |
1232 | FLAG_SET_DEFAULT(UseUnalignedLoadStores, true); |
1233 | } |
1234 | } |
1235 | |
1236 | #ifdef COMPILER2 |
1237 | if (cpu_family() < 0x17 && MaxVectorSize > 16) { |
      // Limit vector size to 16 bytes on AMD CPUs below family 17h.
1239 | FLAG_SET_DEFAULT(MaxVectorSize, 16); |
1240 | } |
1241 | #endif // COMPILER2 |
1242 | |
1243 | // Some defaults for AMD family 17h || Hygon family 18h |
1244 | if (cpu_family() == 0x17 || cpu_family() == 0x18) { |
1245 | // On family 17h processors use XMM and UnalignedLoadStores for Array Copy |
1246 | if (supports_sse2() && FLAG_IS_DEFAULT(UseXMMForArrayCopy)) { |
1247 | FLAG_SET_DEFAULT(UseXMMForArrayCopy, true); |
1248 | } |
1249 | if (supports_sse2() && FLAG_IS_DEFAULT(UseUnalignedLoadStores)) { |
1250 | FLAG_SET_DEFAULT(UseUnalignedLoadStores, true); |
1251 | } |
1252 | #ifdef COMPILER2 |
1253 | if (supports_sse4_2() && FLAG_IS_DEFAULT(UseFPUForSpilling)) { |
1254 | FLAG_SET_DEFAULT(UseFPUForSpilling, true); |
1255 | } |
1256 | #endif |
1257 | } |
1258 | } |
1259 | |
1260 | if (is_intel()) { // Intel cpus specific settings |
1261 | if (FLAG_IS_DEFAULT(UseStoreImmI16)) { |
1262 | UseStoreImmI16 = false; // don't use it on Intel cpus |
1263 | } |
1264 | if (cpu_family() == 6 || cpu_family() == 15) { |
1265 | if (FLAG_IS_DEFAULT(UseAddressNop)) { |
1266 | // Use it on all Intel cpus starting from PentiumPro |
1267 | UseAddressNop = true; |
1268 | } |
1269 | } |
1270 | if (FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper)) { |
1271 | UseXmmLoadAndClearUpper = true; // use movsd on all Intel cpus |
1272 | } |
1273 | if (FLAG_IS_DEFAULT(UseXmmRegToRegMoveAll)) { |
1274 | if (supports_sse3()) { |
1275 | UseXmmRegToRegMoveAll = true; // use movaps, movapd on new Intel cpus |
1276 | } else { |
1277 | UseXmmRegToRegMoveAll = false; |
1278 | } |
1279 | } |
1280 | if (cpu_family() == 6 && supports_sse3()) { // New Intel cpus |
1281 | #ifdef COMPILER2 |
1282 | if (FLAG_IS_DEFAULT(MaxLoopPad)) { |
1283 | // For new Intel cpus do the next optimization: |
1284 | // don't align the beginning of a loop if there are enough instructions |
1285 | // left (NumberOfLoopInstrToAlign defined in c2_globals.hpp) |
1286 | // in current fetch line (OptoLoopAlignment) or the padding |
1287 | // is big (> MaxLoopPad). |
1288 | // Set MaxLoopPad to 11 for new Intel cpus to reduce number of |
1289 | // generated NOP instructions. 11 is the largest size of one |
1290 | // address NOP instruction '0F 1F' (see Assembler::nop(i)). |
1291 | MaxLoopPad = 11; |
1292 | } |
1293 | #endif // COMPILER2 |
1294 | if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) { |
1295 | UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus |
1296 | } |
1297 | if ((supports_sse4_2() && supports_ht()) || supports_avx()) { // Newest Intel cpus |
1298 | if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) { |
1299 | UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus |
1300 | } |
1301 | } |
1302 | if (supports_sse4_2()) { |
1303 | if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) { |
1304 | FLAG_SET_DEFAULT(UseSSE42Intrinsics, true); |
1305 | } |
1306 | } else { |
        if (UseSSE42Intrinsics && !FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
1308 | warning("SSE4.2 intrinsics require SSE4.2 instructions or higher. Intrinsics will be disabled." ); |
1309 | } |
1310 | FLAG_SET_DEFAULT(UseSSE42Intrinsics, false); |
1311 | } |
1312 | } |
1313 | if (is_atom_family() || is_knights_family()) { |
1314 | #ifdef COMPILER2 |
1315 | if (FLAG_IS_DEFAULT(OptoScheduling)) { |
1316 | OptoScheduling = true; |
1317 | } |
1318 | #endif |
1319 | if (supports_sse4_2()) { // Silvermont |
1320 | if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) { |
1321 | UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus |
1322 | } |
1323 | } |
1324 | if (FLAG_IS_DEFAULT(UseIncDec)) { |
1325 | FLAG_SET_DEFAULT(UseIncDec, false); |
1326 | } |
1327 | } |
1328 | if (FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) { |
1329 | FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3); |
1330 | } |
1331 | } |
1332 | |
1333 | #ifdef _LP64 |
1334 | if (UseSSE42Intrinsics) { |
1335 | if (FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) { |
1336 | UseVectorizedMismatchIntrinsic = true; |
1337 | } |
1338 | } else if (UseVectorizedMismatchIntrinsic) { |
1339 | if (!FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) |
1340 | warning("vectorizedMismatch intrinsics are not available on this CPU" ); |
1341 | FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false); |
1342 | } |
1343 | #else |
1344 | if (UseVectorizedMismatchIntrinsic) { |
1345 | if (!FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) { |
1346 | warning("vectorizedMismatch intrinsic is not available in 32-bit VM" ); |
1347 | } |
1348 | FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false); |
1349 | } |
1350 | #endif // _LP64 |
1351 | |
  // Use the count leading zeros (lzcnt) instruction if available.
1353 | if (supports_lzcnt()) { |
1354 | if (FLAG_IS_DEFAULT(UseCountLeadingZerosInstruction)) { |
1355 | UseCountLeadingZerosInstruction = true; |
1356 | } |
1357 | } else if (UseCountLeadingZerosInstruction) { |
1358 | warning("lzcnt instruction is not available on this CPU" ); |
1359 | FLAG_SET_DEFAULT(UseCountLeadingZerosInstruction, false); |
1360 | } |
1361 | |
1362 | // Use count trailing zeros instruction if available |
1363 | if (supports_bmi1()) { |
1364 | // tzcnt does not require VEX prefix |
1365 | if (FLAG_IS_DEFAULT(UseCountTrailingZerosInstruction)) { |
1366 | if (!UseBMI1Instructions && !FLAG_IS_DEFAULT(UseBMI1Instructions)) { |
1367 | // Don't use tzcnt if BMI1 is switched off on command line. |
1368 | UseCountTrailingZerosInstruction = false; |
1369 | } else { |
1370 | UseCountTrailingZerosInstruction = true; |
1371 | } |
1372 | } |
1373 | } else if (UseCountTrailingZerosInstruction) { |
1374 | warning("tzcnt instruction is not available on this CPU" ); |
1375 | FLAG_SET_DEFAULT(UseCountTrailingZerosInstruction, false); |
1376 | } |
1377 | |
1378 | // BMI instructions (except tzcnt) use an encoding with VEX prefix. |
1379 | // VEX prefix is generated only when AVX > 0. |
1380 | if (supports_bmi1() && supports_avx()) { |
1381 | if (FLAG_IS_DEFAULT(UseBMI1Instructions)) { |
1382 | UseBMI1Instructions = true; |
1383 | } |
1384 | } else if (UseBMI1Instructions) { |
1385 | warning("BMI1 instructions are not available on this CPU (AVX is also required)" ); |
1386 | FLAG_SET_DEFAULT(UseBMI1Instructions, false); |
1387 | } |
1388 | |
1389 | if (supports_bmi2() && supports_avx()) { |
1390 | if (FLAG_IS_DEFAULT(UseBMI2Instructions)) { |
1391 | UseBMI2Instructions = true; |
1392 | } |
1393 | } else if (UseBMI2Instructions) { |
1394 | warning("BMI2 instructions are not available on this CPU (AVX is also required)" ); |
1395 | FLAG_SET_DEFAULT(UseBMI2Instructions, false); |
1396 | } |
1397 | |
1398 | // Use population count instruction if available. |
1399 | if (supports_popcnt()) { |
1400 | if (FLAG_IS_DEFAULT(UsePopCountInstruction)) { |
1401 | UsePopCountInstruction = true; |
1402 | } |
1403 | } else if (UsePopCountInstruction) { |
1404 | warning("POPCNT instruction is not available on this CPU" ); |
1405 | FLAG_SET_DEFAULT(UsePopCountInstruction, false); |
1406 | } |
1407 | |
1408 | // Use fast-string operations if available. |
1409 | if (supports_erms()) { |
1410 | if (FLAG_IS_DEFAULT(UseFastStosb)) { |
1411 | UseFastStosb = true; |
1412 | } |
1413 | } else if (UseFastStosb) { |
1414 | warning("fast-string operations are not available on this CPU" ); |
1415 | FLAG_SET_DEFAULT(UseFastStosb, false); |
1416 | } |
1417 | |
1418 | // Use XMM/YMM MOVDQU instruction for Object Initialization |
1419 | if (!UseFastStosb && UseSSE >= 2 && UseUnalignedLoadStores) { |
1420 | if (FLAG_IS_DEFAULT(UseXMMForObjInit)) { |
1421 | UseXMMForObjInit = true; |
1422 | } |
1423 | } else if (UseXMMForObjInit) { |
1424 | warning("UseXMMForObjInit requires SSE2 and unaligned load/stores. Feature is switched off." ); |
1425 | FLAG_SET_DEFAULT(UseXMMForObjInit, false); |
1426 | } |
1427 | |
1428 | #ifdef COMPILER2 |
1429 | if (FLAG_IS_DEFAULT(AlignVector)) { |
1430 | // Modern processors allow misaligned memory operations for vectors. |
1431 | AlignVector = !UseUnalignedLoadStores; |
1432 | } |
1433 | #endif // COMPILER2 |
1434 | |
1435 | if (FLAG_IS_DEFAULT(AllocatePrefetchInstr)) { |
1436 | if (AllocatePrefetchInstr == 3 && !supports_3dnow_prefetch()) { |
1437 | FLAG_SET_DEFAULT(AllocatePrefetchInstr, 0); |
1438 | } else if (!supports_sse() && supports_3dnow_prefetch()) { |
1439 | FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3); |
1440 | } |
1441 | } |
1442 | |
1443 | // Allocation prefetch settings |
1444 | intx cache_line_size = prefetch_data_size(); |
1445 | if (FLAG_IS_DEFAULT(AllocatePrefetchStepSize) && |
1446 | (cache_line_size > AllocatePrefetchStepSize)) { |
1447 | FLAG_SET_DEFAULT(AllocatePrefetchStepSize, cache_line_size); |
1448 | } |
1449 | |
1450 | if ((AllocatePrefetchDistance == 0) && (AllocatePrefetchStyle != 0)) { |
1451 | assert(!FLAG_IS_DEFAULT(AllocatePrefetchDistance), "default value should not be 0" ); |
1452 | if (!FLAG_IS_DEFAULT(AllocatePrefetchStyle)) { |
      warning("AllocatePrefetchDistance is set to 0 which disables prefetching. Ignoring AllocatePrefetchStyle flag.");
1454 | } |
1455 | FLAG_SET_DEFAULT(AllocatePrefetchStyle, 0); |
1456 | } |
1457 | |
1458 | if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) { |
1459 | bool use_watermark_prefetch = (AllocatePrefetchStyle == 2); |
1460 | FLAG_SET_DEFAULT(AllocatePrefetchDistance, allocate_prefetch_distance(use_watermark_prefetch)); |
1461 | } |
1462 | |
1463 | if (is_intel() && cpu_family() == 6 && supports_sse3()) { |
1464 | if (FLAG_IS_DEFAULT(AllocatePrefetchLines) && |
1465 | supports_sse4_2() && supports_ht()) { // Nehalem based cpus |
1466 | FLAG_SET_DEFAULT(AllocatePrefetchLines, 4); |
1467 | } |
1468 | #ifdef COMPILER2 |
1469 | if (FLAG_IS_DEFAULT(UseFPUForSpilling) && supports_sse4_2()) { |
1470 | FLAG_SET_DEFAULT(UseFPUForSpilling, true); |
1471 | } |
1472 | #endif |
1473 | } |
1474 | |
1475 | if (is_zx() && ((cpu_family() == 6) || (cpu_family() == 7)) && supports_sse4_2()) { |
1476 | #ifdef COMPILER2 |
1477 | if (FLAG_IS_DEFAULT(UseFPUForSpilling)) { |
1478 | FLAG_SET_DEFAULT(UseFPUForSpilling, true); |
1479 | } |
1480 | #endif |
1481 | } |
1482 | |
1483 | #ifdef _LP64 |
1484 | // Prefetch settings |
1485 | |
  // Prefetch interval for GC copy/scan == 9 dcache lines. Derived from
  // 50-warehouse SPECjbb runs on a 2-way 1.8 GHz Opteron using a 4 GB heap.
  // Tested intervals from 128 to 2048 bytes in increments of 64 (one cache line);
  // 256 bytes (4 dcache lines) was the nearest runner-up to 576.

  // GC copy/scan prefetching is disabled if prefetchw isn't supported, because
  // Prefetch::write emits an inlined prefetchw on Linux.
  // Do not use the 3dnow prefetchw instruction; it isn't supported on em64t.
  // The prefetcht0 instruction that is used instead works on both amd64 and em64t.
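  // Relative to the common 64-byte dcache line, the 576-byte interval chosen below
  // corresponds to 9 lines. These remain defaults and can still be overridden on the
  // command line, e.g. with -XX:PrefetchCopyIntervalInBytes=384.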
1495 | |
1496 | if (FLAG_IS_DEFAULT(PrefetchCopyIntervalInBytes)) { |
1497 | FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 576); |
1498 | } |
1499 | if (FLAG_IS_DEFAULT(PrefetchScanIntervalInBytes)) { |
1500 | FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 576); |
1501 | } |
1502 | if (FLAG_IS_DEFAULT(PrefetchFieldsAhead)) { |
1503 | FLAG_SET_DEFAULT(PrefetchFieldsAhead, 1); |
1504 | } |
1505 | #endif |
1506 | |
1507 | if (FLAG_IS_DEFAULT(ContendedPaddingWidth) && |
1508 | (cache_line_size > ContendedPaddingWidth)) |
1509 | ContendedPaddingWidth = cache_line_size; |
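  // For example, with the 64-byte lines reported by most current x86 parts the default
  // ContendedPaddingWidth (128 bytes) already exceeds the line size and is kept; only a
  // processor reporting a larger line than that would widen the @Contended padding here.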
1510 | |
1511 | // This machine allows unaligned memory accesses |
1512 | if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) { |
1513 | FLAG_SET_DEFAULT(UseUnalignedAccesses, true); |
1514 | } |
1515 | |
1516 | #ifndef PRODUCT |
1517 | if (log_is_enabled(Info, os, cpu)) { |
1518 | LogStream ls(Log(os, cpu)::info()); |
1519 | outputStream* log = &ls; |
1520 | log->print_cr("Logical CPUs per core: %u" , |
1521 | logical_processors_per_package()); |
1522 | log->print_cr("L1 data cache line size: %u" , L1_data_cache_line_size()); |
1523 | log->print("UseSSE=%d" , (int) UseSSE); |
1524 | if (UseAVX > 0) { |
1525 | log->print(" UseAVX=%d" , (int) UseAVX); |
1526 | } |
1527 | if (UseAES) { |
1528 | log->print(" UseAES=1" ); |
1529 | } |
1530 | #ifdef COMPILER2 |
1531 | if (MaxVectorSize > 0) { |
1532 | log->print(" MaxVectorSize=%d" , (int) MaxVectorSize); |
1533 | } |
1534 | #endif |
1535 | log->cr(); |
1536 | log->print("Allocation" ); |
1537 | if (AllocatePrefetchStyle <= 0 || (UseSSE == 0 && !supports_3dnow_prefetch())) { |
1538 | log->print_cr(": no prefetching" ); |
1539 | } else { |
1540 | log->print(" prefetching: " ); |
1541 | if (UseSSE == 0 && supports_3dnow_prefetch()) { |
1542 | log->print("PREFETCHW" ); |
1543 | } else if (UseSSE >= 1) { |
1544 | if (AllocatePrefetchInstr == 0) { |
1545 | log->print("PREFETCHNTA" ); |
1546 | } else if (AllocatePrefetchInstr == 1) { |
1547 | log->print("PREFETCHT0" ); |
1548 | } else if (AllocatePrefetchInstr == 2) { |
1549 | log->print("PREFETCHT2" ); |
1550 | } else if (AllocatePrefetchInstr == 3) { |
1551 | log->print("PREFETCHW" ); |
1552 | } |
1553 | } |
1554 | if (AllocatePrefetchLines > 1) { |
1555 | log->print_cr(" at distance %d, %d lines of %d bytes" , (int) AllocatePrefetchDistance, (int) AllocatePrefetchLines, (int) AllocatePrefetchStepSize); |
1556 | } else { |
1557 | log->print_cr(" at distance %d, one line of %d bytes" , (int) AllocatePrefetchDistance, (int) AllocatePrefetchStepSize); |
1558 | } |
1559 | } |
1560 | |
1561 | if (PrefetchCopyIntervalInBytes > 0) { |
1562 | log->print_cr("PrefetchCopyIntervalInBytes %d" , (int) PrefetchCopyIntervalInBytes); |
1563 | } |
1564 | if (PrefetchScanIntervalInBytes > 0) { |
1565 | log->print_cr("PrefetchScanIntervalInBytes %d" , (int) PrefetchScanIntervalInBytes); |
1566 | } |
1567 | if (PrefetchFieldsAhead > 0) { |
1568 | log->print_cr("PrefetchFieldsAhead %d" , (int) PrefetchFieldsAhead); |
1569 | } |
1570 | if (ContendedPaddingWidth > 0) { |
1571 | log->print_cr("ContendedPaddingWidth %d" , (int) ContendedPaddingWidth); |
1572 | } |
1573 | } |
1574 | #endif // !PRODUCT |
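  // This diagnostic block is compiled into non-product builds only and is enabled with
  // -Xlog:os+cpu=info. Illustratively (the exact values vary by CPU and flag settings),
  // it produces output of the form:
  //
  //   Logical CPUs per core: 2
  //   L1 data cache line size: 64
  //   UseSSE=4 UseAVX=2 UseAES=1 MaxVectorSize=32
  //   Allocation prefetching: PREFETCHNTA at distance 192, 4 lines of 64 bytes
  //   PrefetchCopyIntervalInBytes 576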
1575 | } |
1576 | |
1577 | void VM_Version::print_platform_virtualization_info(outputStream* st) { |
1578 | VirtualizationType vrt = VM_Version::get_detected_virtualization(); |
1579 | if (vrt == XenHVM) { |
1580 | st->print_cr("Xen hardware-assisted virtualization detected" ); |
1581 | } else if (vrt == KVM) { |
1582 | st->print_cr("KVM virtualization detected" ); |
1583 | } else if (vrt == VMWare) { |
1584 | st->print_cr("VMWare virtualization detected" ); |
1585 | VirtualizationSupport::print_virtualization_info(st); |
1586 | } else if (vrt == HyperV) { |
1587 | st->print_cr("HyperV virtualization detected" ); |
1588 | } |
1589 | } |
1590 | |
1591 | void VM_Version::check_virt_cpuid(uint32_t idx, uint32_t *regs) { |
1592 | // TODO support 32 bit |
1593 | #if defined(_LP64) |
1594 | #if defined(_MSC_VER) |
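  // 64-bit MSVC has no inline assembler, so the CPUID call is emitted into a small
  // runtime-generated stub below and invoked through a function pointer.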
1595 | // Allocate space for the code |
1596 | const int code_size = 100; |
1597 | ResourceMark rm; |
1598 | CodeBuffer cb("detect_virt" , code_size, 0); |
1599 | MacroAssembler* a = new MacroAssembler(&cb); |
1600 | address code = a->pc(); |
1601 | void (*test)(uint32_t idx, uint32_t *regs) = (void(*)(uint32_t idx, uint32_t *regs))code; |
1602 | |
1603 | a->movq(r9, rbx); // save nonvolatile register |
1604 | |
1605 | // next line would not work on 32-bit |
1606 | a->movq(rax, c_rarg0 /* rcx */); |
1607 | a->movq(r8, c_rarg1 /* rdx */); |
1608 | a->cpuid(); |
1609 | a->movl(Address(r8, 0), rax); |
1610 | a->movl(Address(r8, 4), rbx); |
1611 | a->movl(Address(r8, 8), rcx); |
1612 | a->movl(Address(r8, 12), rdx); |
1613 | |
1614 | a->movq(rbx, r9); // restore nonvolatile register |
1615 | a->ret(0); |
1616 | |
1617 | uint32_t *code_end = (uint32_t *)a->pc(); |
1618 | a->flush(); |
1619 | |
1620 | // execute code |
1621 | (*test)(idx, regs); |
1622 | #elif defined(__GNUC__) |
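  // Constraints: "+a"(idx) passes the CPUID leaf in EAX and receives EAX back, "S"(regs)
  // passes the output buffer in RSI/ESI, and EBX/ECX/EDX plus memory are declared
  // clobbered because CPUID overwrites those registers and the buffer is written through
  // a pointer.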
1623 | __asm__ volatile ( |
1624 | " cpuid;" |
1625 | " mov %%eax,(%1);" |
1626 | " mov %%ebx,4(%1);" |
1627 | " mov %%ecx,8(%1);" |
1628 | " mov %%edx,12(%1);" |
1629 | : "+a" (idx) |
1630 | : "S" (regs) |
1631 | : "ebx" , "ecx" , "edx" , "memory" ); |
1632 | #endif |
1633 | #endif |
1634 | } |
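
// For reference only: on GCC/Clang the same hypervisor leaf can be queried with the
// <cpuid.h> helpers instead of hand-written assembly. A minimal sketch (not used by
// HotSpot, which relies on the paths above):
//
//   #include <cpuid.h>
//   unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
//   __cpuid(0x40000000, eax, ebx, ecx, edx);  // hypervisor vendor leaf
//   // ebx, ecx, edx now hold the 12-byte vendor signature, e.g. "KVMKVMKVM".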
1635 | |
1636 | |
1637 | bool VM_Version::use_biased_locking() { |
1638 | #if INCLUDE_RTM_OPT |
1639 | // RTM locking is most useful when there is high lock contention and |
1640 | // low data contention. With high lock contention the lock is usually |
1641 | // inflated and biased locking is not suitable for that case. |
1642 | // RTM locking code requires that biased locking is off. |
1643 | // Note: we can't switch off UseBiasedLocking in get_processor_features() |
1644 | // because it is used by Thread::allocate() which is called before |
1645 | // VM_Version::initialize(). |
1646 | if (UseRTMLocking && UseBiasedLocking) { |
1647 | if (FLAG_IS_DEFAULT(UseBiasedLocking)) { |
1648 | FLAG_SET_DEFAULT(UseBiasedLocking, false); |
1649 | } else { |
1650 | warning("Biased locking is not supported with RTM locking; ignoring UseBiasedLocking flag." ); |
1651 | UseBiasedLocking = false; |
1652 | } |
1653 | } |
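  // Illustratively, starting the VM with both -XX:+UseRTMLocking and an explicit
  // -XX:+UseBiasedLocking prints the warning above and runs with biased locking off;
  // when UseBiasedLocking is left at its default it is switched off without a warning.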
1654 | #endif |
1655 | return UseBiasedLocking; |
1656 | } |
1657 | |
1658 | // On Xen, the cpuid instruction returns |
1659 | // eax / registers[0]: Version of Xen |
1660 | // ebx / registers[1]: chars 'XenV' |
1661 | // ecx / registers[2]: chars 'MMXe' |
1662 | // edx / registers[3]: chars 'nVMM' |
1663 | // |
1664 | // On KVM / VMWare / MS Hyper-V, the cpuid instruction returns |
1665 | // ebx / registers[1]: chars 'KVMK' / 'VMwa' / 'Micr' |
1666 | // ecx / registers[2]: chars 'VMKV' / 'reVM' / 'osof' |
1667 | // edx / registers[3]: chars 'M' / 'ware' / 't Hv' |
1668 | // |
1669 | // more information : |
1670 | // https://kb.vmware.com/s/article/1009458 |
1671 | // |
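// The 12-character signature is assembled from ebx, ecx and edx in that order. x86 is
// little-endian, so, for example, KVM reports ebx = 0x4B4D564B ('KVMK'), ecx = 0x564B4D56
// ('VMKV') and edx = 0x0000004D ('M'), which concatenate to the "KVMKVMKVM" string matched
// below.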
1672 | void VM_Version::check_virtualizations() { |
1673 | #if defined(_LP64) |
1674 | uint32_t registers[4]; |
1675 | char signature[13]; |
1676 | uint32_t base; |
1677 | signature[12] = '\0'; |
1678 | memset((void*)registers, 0, 4*sizeof(uint32_t)); |
1679 | |
1680 | for (base = 0x40000000; base < 0x40010000; base += 0x100) { |
1681 | check_virt_cpuid(base, registers); |
1682 | |
1683 | *(uint32_t *)(signature + 0) = registers[1]; |
1684 | *(uint32_t *)(signature + 4) = registers[2]; |
1685 | *(uint32_t *)(signature + 8) = registers[3]; |
1686 | |
1687 | if (strncmp("VMwareVMware" , signature, 12) == 0) { |
1688 | Abstract_VM_Version::_detected_virtualization = VMWare; |
1689 | // check for extended metrics from guestlib |
1690 | VirtualizationSupport::initialize(); |
1691 | } |
1692 | |
1693 | if (strncmp("Microsoft Hv" , signature, 12) == 0) { |
1694 | Abstract_VM_Version::_detected_virtualization = HyperV; |
1695 | } |
1696 | |
1697 | if (strncmp("KVMKVMKVM" , signature, 9) == 0) { |
1698 | Abstract_VM_Version::_detected_virtualization = KVM; |
1699 | } |
1700 | |
1701 | if (strncmp("XenVMMXenVMM" , signature, 12) == 0) { |
1702 | Abstract_VM_Version::_detected_virtualization = XenHVM; |
1703 | } |
1704 | } |
1705 | #endif |
1706 | } |
1707 | |
1708 | void VM_Version::initialize() { |
1709 | ResourceMark rm; |
1710 | // Making this stub must be FIRST use of assembler |
1711 | |
1712 | stub_blob = BufferBlob::create("get_cpu_info_stub" , stub_size); |
1713 | if (stub_blob == NULL) { |
1714 | vm_exit_during_initialization("Unable to allocate get_cpu_info_stub" ); |
1715 | } |
1716 | CodeBuffer c(stub_blob); |
1717 | VM_Version_StubGenerator g(&c); |
1718 | get_cpu_info_stub = CAST_TO_FN_PTR(get_cpu_info_stub_t, |
1719 | g.generate_get_cpu_info()); |
1720 | |
1721 | get_processor_features(); |
  if (cpu_family() > 4) { // family > 4 implies the CPUID instruction is available
1723 | check_virtualizations(); |
1724 | } |
1725 | } |
1726 | |