1 /*
2 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "precompiled.hpp"
26 #include "jvm.h"
27 #include "asm/assembler.hpp"
28 #include "asm/assembler.inline.hpp"
29 #include "compiler/compiler_globals.hpp"
30 #include "compiler/disassembler.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/collectedHeap.inline.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/bytecodeHistogram.hpp"
36 #include "interpreter/interpreter.hpp"
37 #include "memory/resourceArea.hpp"
38 #include "memory/universe.hpp"
39 #include "oops/accessDecorators.hpp"
40 #include "oops/compressedOops.inline.hpp"
41 #include "oops/klass.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "runtime/biasedLocking.hpp"
44 #include "runtime/flags/flagSetting.hpp"
45 #include "runtime/interfaceSupport.inline.hpp"
46 #include "runtime/jniHandles.hpp"
47 #include "runtime/objectMonitor.hpp"
48 #include "runtime/os.hpp"
49 #include "runtime/safepoint.hpp"
50 #include "runtime/safepointMechanism.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubRoutines.hpp"
53 #include "runtime/thread.hpp"
54 #include "utilities/macros.hpp"
55 #include "crc32c.h"
56
57 #ifdef PRODUCT
58 #define BLOCK_COMMENT(str) /* nothing */
59 #define STOP(error) stop(error)
60 #else
61 #define BLOCK_COMMENT(str) block_comment(str)
62 #define STOP(error) block_comment(error); stop(error)
63 #endif
64
65 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
66
67 #ifdef ASSERT
68 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
69 #endif
70
71 static Assembler::Condition reverse[] = {
72 Assembler::noOverflow /* overflow = 0x0 */ ,
73 Assembler::overflow /* noOverflow = 0x1 */ ,
74 Assembler::aboveEqual /* carrySet = 0x2, below = 0x2 */ ,
75 Assembler::below /* aboveEqual = 0x3, carryClear = 0x3 */ ,
76 Assembler::notZero /* zero = 0x4, equal = 0x4 */ ,
77 Assembler::zero /* notZero = 0x5, notEqual = 0x5 */ ,
78 Assembler::above /* belowEqual = 0x6 */ ,
79 Assembler::belowEqual /* above = 0x7 */ ,
80 Assembler::positive /* negative = 0x8 */ ,
81 Assembler::negative /* positive = 0x9 */ ,
82 Assembler::noParity /* parity = 0xa */ ,
83 Assembler::parity /* noParity = 0xb */ ,
84 Assembler::greaterEqual /* less = 0xc */ ,
85 Assembler::less /* greaterEqual = 0xd */ ,
86 Assembler::greater /* lessEqual = 0xe */ ,
87 Assembler::lessEqual /* greater = 0xf, */
88
89 };
90
91
92 // Implementation of MacroAssembler
93
94 // First, all the versions that have distinct implementations depending on 32/64 bit,
95 // unless the difference is trivial (1 line or so).
96
97 #ifndef _LP64
98
99 // 32bit versions
100
101 Address MacroAssembler::as_Address(AddressLiteral adr) {
102 return Address(adr.target(), adr.rspec());
103 }
104
105 Address MacroAssembler::as_Address(ArrayAddress adr) {
106 return Address::make_array(adr);
107 }
108
109 void MacroAssembler::call_VM_leaf_base(address entry_point,
110 int number_of_arguments) {
111 call(RuntimeAddress(entry_point));
112 increment(rsp, number_of_arguments * wordSize);
113 }
114
115 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
116 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
117 }
118
119
120 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
121 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
122 }
123
124 void MacroAssembler::cmpoop(Address src1, jobject obj) {
125 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
126 }
127
128 void MacroAssembler::cmpoop(Register src1, jobject obj) {
129 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
130 }
131
132 void MacroAssembler::extend_sign(Register hi, Register lo) {
133 // According to Intel Doc. AP-526, "Integer Divide", p.18.
134 if (VM_Version::is_P6() && hi == rdx && lo == rax) {
135 cdql();
136 } else {
137 movl(hi, lo);
138 sarl(hi, 31);
139 }
140 }
141
142 void MacroAssembler::jC2(Register tmp, Label& L) {
143 // set parity bit if FPU flag C2 is set (via rax)
144 save_rax(tmp);
145 fwait(); fnstsw_ax();
146 sahf();
147 restore_rax(tmp);
148 // branch
149 jcc(Assembler::parity, L);
150 }
151
152 void MacroAssembler::jnC2(Register tmp, Label& L) {
153 // set parity bit if FPU flag C2 is set (via rax)
154 save_rax(tmp);
155 fwait(); fnstsw_ax();
156 sahf();
157 restore_rax(tmp);
158 // branch
159 jcc(Assembler::noParity, L);
160 }
161
162 // 32bit can do a case table jump in one instruction but we no longer allow the base
163 // to be installed in the Address class
164 void MacroAssembler::jump(ArrayAddress entry) {
165 jmp(as_Address(entry));
166 }
167
168 // Note: y_lo will be destroyed
169 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
170 // Long compare for Java (semantics as described in JVM spec.)
171 Label high, low, done;
172
173 cmpl(x_hi, y_hi);
174 jcc(Assembler::less, low);
175 jcc(Assembler::greater, high);
176 // x_hi is the return register
177 xorl(x_hi, x_hi);
178 cmpl(x_lo, y_lo);
179 jcc(Assembler::below, low);
180 jcc(Assembler::equal, done);
181
182 bind(high);
183 xorl(x_hi, x_hi);
184 increment(x_hi);
185 jmp(done);
186
187 bind(low);
188 xorl(x_hi, x_hi);
189 decrementl(x_hi);
190
191 bind(done);
192 }
193
194 void MacroAssembler::lea(Register dst, AddressLiteral src) {
195 mov_literal32(dst, (int32_t)src.target(), src.rspec());
196 }
197
198 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
199 // leal(dst, as_Address(adr));
200 // see note in movl as to why we must use a move
201 mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
202 }
203
204 void MacroAssembler::leave() {
205 mov(rsp, rbp);
206 pop(rbp);
207 }
208
209 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
210 // Multiplication of two Java long values stored on the stack
211 // as illustrated below. Result is in rdx:rax.
212 //
213 // rsp ---> [ ?? ] \ \
214 // .... | y_rsp_offset |
215 // [ y_lo ] / (in bytes) | x_rsp_offset
216 // [ y_hi ] | (in bytes)
217 // .... |
218 // [ x_lo ] /
219 // [ x_hi ]
220 // ....
221 //
222 // Basic idea: lo(result) = lo(x_lo * y_lo)
223 // hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
224 Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
225 Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
226 Label quick;
227 // load x_hi, y_hi and check if quick
228 // multiplication is possible
229 movl(rbx, x_hi);
230 movl(rcx, y_hi);
231 movl(rax, rbx);
232 orl(rbx, rcx); // rbx, = 0 <=> x_hi = 0 and y_hi = 0
233 jcc(Assembler::zero, quick); // if rbx, = 0 do quick multiply
234 // do full multiplication
235 // 1st step
236 mull(y_lo); // x_hi * y_lo
237 movl(rbx, rax); // save lo(x_hi * y_lo) in rbx,
238 // 2nd step
239 movl(rax, x_lo);
240 mull(rcx); // x_lo * y_hi
241 addl(rbx, rax); // add lo(x_lo * y_hi) to rbx,
242 // 3rd step
243 bind(quick); // note: rbx, = 0 if quick multiply!
244 movl(rax, x_lo);
245 mull(y_lo); // x_lo * y_lo
246 addl(rdx, rbx); // correct hi(x_lo * y_lo)
247 }
248
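// Negate the 64-bit value held in the hi:lo register pair (two's-complement
// negation carried across both halves).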
249 void MacroAssembler::lneg(Register hi, Register lo) {
250 negl(lo);
251 adcl(hi, 0);
252 negl(hi);
253 }
254
255 void MacroAssembler::lshl(Register hi, Register lo) {
256 // Java shift left long support (semantics as described in JVM spec., p.305)
257 // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
258 // shift value is in rcx !
259 assert(hi != rcx, "must not use rcx");
260 assert(lo != rcx, "must not use rcx");
261 const Register s = rcx; // shift count
262 const int n = BitsPerWord;
263 Label L;
264 andl(s, 0x3f); // s := s & 0x3f (s < 0x40)
265 cmpl(s, n); // if (s < n)
266 jcc(Assembler::less, L); // else (s >= n)
267 movl(hi, lo); // x := x << n
268 xorl(lo, lo);
269 // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
270 bind(L); // s (mod n) < n
271 shldl(hi, lo); // x := x << s
272 shll(lo);
273 }
274
275
276 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
277 // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
278 // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
279 assert(hi != rcx, "must not use rcx");
280 assert(lo != rcx, "must not use rcx");
281 const Register s = rcx; // shift count
282 const int n = BitsPerWord;
283 Label L;
284 andl(s, 0x3f); // s := s & 0x3f (s < 0x40)
285 cmpl(s, n); // if (s < n)
286 jcc(Assembler::less, L); // else (s >= n)
287 movl(lo, hi); // x := x >> n
288 if (sign_extension) sarl(hi, 31);
289 else xorl(hi, hi);
290 // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
291 bind(L); // s (mod n) < n
292 shrdl(lo, hi); // x := x >> s
293 if (sign_extension) sarl(hi);
294 else shrl(hi);
295 }
296
297 void MacroAssembler::movoop(Register dst, jobject obj) {
298 mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
299 }
300
301 void MacroAssembler::movoop(Address dst, jobject obj) {
302 mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
303 }
304
305 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
306 mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
307 }
308
309 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
310 mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
311 }
312
313 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
314 // scratch register is not used,
315 // it is defined to match parameters of 64-bit version of this method.
316 if (src.is_lval()) {
317 mov_literal32(dst, (intptr_t)src.target(), src.rspec());
318 } else {
319 movl(dst, as_Address(src));
320 }
321 }
322
323 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
324 movl(as_Address(dst), src);
325 }
326
327 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
328 movl(dst, as_Address(src));
329 }
330
331 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
332 void MacroAssembler::movptr(Address dst, intptr_t src) {
333 movl(dst, src);
334 }
335
336
337 void MacroAssembler::pop_callee_saved_registers() {
338 pop(rcx);
339 pop(rdx);
340 pop(rdi);
341 pop(rsi);
342 }
343
344 void MacroAssembler::push_callee_saved_registers() {
345 push(rsi);
346 push(rdi);
347 push(rdx);
348 push(rcx);
349 }
350
351 void MacroAssembler::pushoop(jobject obj) {
352 push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
353 }
354
355 void MacroAssembler::pushklass(Metadata* obj) {
356 push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
357 }
358
359 void MacroAssembler::pushptr(AddressLiteral src) {
360 if (src.is_lval()) {
361 push_literal32((int32_t)src.target(), src.rspec());
362 } else {
363 pushl(as_Address(src));
364 }
365 }
366
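// On 32-bit, C arguments are passed on the stack, so each pass_argN simply
// pushes its register (callers pass the highest-numbered argument first).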
367 static void pass_arg0(MacroAssembler* masm, Register arg) {
368 masm->push(arg);
369 }
370
371 static void pass_arg1(MacroAssembler* masm, Register arg) {
372 masm->push(arg);
373 }
374
375 static void pass_arg2(MacroAssembler* masm, Register arg) {
376 masm->push(arg);
377 }
378
379 static void pass_arg3(MacroAssembler* masm, Register arg) {
380 masm->push(arg);
381 }
382
383 #ifndef PRODUCT
384 extern "C" void findpc(intptr_t x);
385 #endif
386
387 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
388   // In order to get locks to work, we need to fake an in_VM state
389 JavaThread* thread = JavaThread::current();
390 JavaThreadState saved_state = thread->thread_state();
391 thread->set_thread_state(_thread_in_vm);
392 if (ShowMessageBoxOnError) {
393 JavaThread* thread = JavaThread::current();
394 JavaThreadState saved_state = thread->thread_state();
395 thread->set_thread_state(_thread_in_vm);
396 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
397 ttyLocker ttyl;
398 BytecodeCounter::print();
399 }
400 // To see where a verify_oop failed, get $ebx+40/X for this frame.
401 // This is the value of eip which points to where verify_oop will return.
402 if (os::message_box(msg, "Execution stopped, print registers?")) {
403 print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
404 BREAKPOINT;
405 }
406 }
407 fatal("DEBUG MESSAGE: %s", msg);
408 }
409
410 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
411 ttyLocker ttyl;
412 FlagSetting fs(Debugging, true);
413 tty->print_cr("eip = 0x%08x", eip);
414 #ifndef PRODUCT
415 if ((WizardMode || Verbose) && PrintMiscellaneous) {
416 tty->cr();
417 findpc(eip);
418 tty->cr();
419 }
420 #endif
421 #define PRINT_REG(rax) \
422 { tty->print("%s = ", #rax); os::print_location(tty, rax); }
423 PRINT_REG(rax);
424 PRINT_REG(rbx);
425 PRINT_REG(rcx);
426 PRINT_REG(rdx);
427 PRINT_REG(rdi);
428 PRINT_REG(rsi);
429 PRINT_REG(rbp);
430 PRINT_REG(rsp);
431 #undef PRINT_REG
432   // Print some words near top of stack.
433 int* dump_sp = (int*) rsp;
434 for (int col1 = 0; col1 < 8; col1++) {
435 tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
436 os::print_location(tty, *dump_sp++);
437 }
438 for (int row = 0; row < 16; row++) {
439 tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
440 for (int col = 0; col < 8; col++) {
441 tty->print(" 0x%08x", *dump_sp++);
442 }
443 tty->cr();
444 }
445 // Print some instructions around pc:
446 Disassembler::decode((address)eip-64, (address)eip);
447 tty->print_cr("--------");
448 Disassembler::decode((address)eip, (address)eip+32);
449 }
450
451 void MacroAssembler::stop(const char* msg) {
452 ExternalAddress message((address)msg);
453 // push address of message
454 pushptr(message.addr());
455 { Label L; call(L, relocInfo::none); bind(L); } // push eip
456 pusha(); // push registers
457 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
458 hlt();
459 }
460
461 void MacroAssembler::warn(const char* msg) {
462 push_CPU_state();
463
464 ExternalAddress message((address) msg);
465 // push address of message
466 pushptr(message.addr());
467
468 call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
469 addl(rsp, wordSize); // discard argument
470 pop_CPU_state();
471 }
472
473 void MacroAssembler::print_state() {
474 { Label L; call(L, relocInfo::none); bind(L); } // push eip
475 pusha(); // push registers
476
477 push_CPU_state();
478 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
479 pop_CPU_state();
480
481 popa();
482 addl(rsp, wordSize);
483 }
484
485 #else // _LP64
486
487 // 64 bit versions
488
489 Address MacroAssembler::as_Address(AddressLiteral adr) {
490 // amd64 always does this as a pc-rel
491 // we can be absolute or disp based on the instruction type
492 // jmp/call are displacements others are absolute
493 assert(!adr.is_lval(), "must be rval");
494 assert(reachable(adr), "must be");
495 return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
496
497 }
498
499 Address MacroAssembler::as_Address(ArrayAddress adr) {
500 AddressLiteral base = adr.base();
501 lea(rscratch1, base);
502 Address index = adr.index();
503 assert(index._disp == 0, "must not have disp"); // maybe it can?
504 Address array(rscratch1, index._index, index._scale, index._disp);
505 return array;
506 }
507
508 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
509 Label L, E;
510
511 #ifdef _WIN64
512   // Windows always allocates space for its register args
513 assert(num_args <= 4, "only register arguments supported");
514 subq(rsp, frame::arg_reg_save_area_bytes);
515 #endif
516
517 // Align stack if necessary
518 testl(rsp, 15);
519 jcc(Assembler::zero, L);
520
521 subq(rsp, 8);
522 {
523 call(RuntimeAddress(entry_point));
524 }
525 addq(rsp, 8);
526 jmp(E);
527
528 bind(L);
529 {
530 call(RuntimeAddress(entry_point));
531 }
532
533 bind(E);
534
535 #ifdef _WIN64
536 // restore stack pointer
537 addq(rsp, frame::arg_reg_save_area_bytes);
538 #endif
539
540 }
541
542 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
543 assert(!src2.is_lval(), "should use cmpptr");
544
545 if (reachable(src2)) {
546 cmpq(src1, as_Address(src2));
547 } else {
548 lea(rscratch1, src2);
549 Assembler::cmpq(src1, Address(rscratch1, 0));
550 }
551 }
552
553 int MacroAssembler::corrected_idivq(Register reg) {
554 // Full implementation of Java ldiv and lrem; checks for special
555 // case as described in JVM spec., p.243 & p.271. The function
556 // returns the (pc) offset of the idivl instruction - may be needed
557 // for implicit exceptions.
558 //
559 // normal case special case
560 //
561 // input : rax: dividend min_long
562 // reg: divisor (may not be eax/edx) -1
563 //
564 // output: rax: quotient (= rax idiv reg) min_long
565 // rdx: remainder (= rax irem reg) 0
566 assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
567 static const int64_t min_long = 0x8000000000000000;
568 Label normal_case, special_case;
569
570 // check for special case
571 cmp64(rax, ExternalAddress((address) &min_long));
572 jcc(Assembler::notEqual, normal_case);
573 xorl(rdx, rdx); // prepare rdx for possible special case (where
574 // remainder = 0)
575 cmpq(reg, -1);
576 jcc(Assembler::equal, special_case);
577
578 // handle normal case
579 bind(normal_case);
580 cdqq();
581 int idivq_offset = offset();
582 idivq(reg);
583
584 // normal and special case exit
585 bind(special_case);
586
587 return idivq_offset;
588 }
589
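// decrementq/incrementq: adjust by a small constant, using dec/inc when
// profitable (UseIncDec) and sub/add otherwise; value == min_jint goes
// straight to sub/add since negating it would overflow.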
590 void MacroAssembler::decrementq(Register reg, int value) {
591 if (value == min_jint) { subq(reg, value); return; }
592 if (value < 0) { incrementq(reg, -value); return; }
593 if (value == 0) { ; return; }
594 if (value == 1 && UseIncDec) { decq(reg) ; return; }
595 /* else */ { subq(reg, value) ; return; }
596 }
597
598 void MacroAssembler::decrementq(Address dst, int value) {
599 if (value == min_jint) { subq(dst, value); return; }
600 if (value < 0) { incrementq(dst, -value); return; }
601 if (value == 0) { ; return; }
602 if (value == 1 && UseIncDec) { decq(dst) ; return; }
603 /* else */ { subq(dst, value) ; return; }
604 }
605
606 void MacroAssembler::incrementq(AddressLiteral dst) {
607 if (reachable(dst)) {
608 incrementq(as_Address(dst));
609 } else {
610 lea(rscratch1, dst);
611 incrementq(Address(rscratch1, 0));
612 }
613 }
614
615 void MacroAssembler::incrementq(Register reg, int value) {
616 if (value == min_jint) { addq(reg, value); return; }
617 if (value < 0) { decrementq(reg, -value); return; }
618 if (value == 0) { ; return; }
619 if (value == 1 && UseIncDec) { incq(reg) ; return; }
620 /* else */ { addq(reg, value) ; return; }
621 }
622
623 void MacroAssembler::incrementq(Address dst, int value) {
624 if (value == min_jint) { addq(dst, value); return; }
625 if (value < 0) { decrementq(dst, -value); return; }
626 if (value == 0) { ; return; }
627 if (value == 1 && UseIncDec) { incq(dst) ; return; }
628 /* else */ { addq(dst, value) ; return; }
629 }
630
631 // 32bit can do a case table jump in one instruction but we no longer allow the base
632 // to be installed in the Address class
633 void MacroAssembler::jump(ArrayAddress entry) {
634 lea(rscratch1, entry.base());
635 Address dispatch = entry.index();
636 assert(dispatch._base == noreg, "must be");
637 dispatch._base = rscratch1;
638 jmp(dispatch);
639 }
640
641 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
642 ShouldNotReachHere(); // 64bit doesn't use two regs
643 cmpq(x_lo, y_lo);
644 }
645
646 void MacroAssembler::lea(Register dst, AddressLiteral src) {
647 mov_literal64(dst, (intptr_t)src.target(), src.rspec());
648 }
649
650 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
651 mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
652 movptr(dst, rscratch1);
653 }
654
655 void MacroAssembler::leave() {
656 // %%% is this really better? Why not on 32bit too?
657 emit_int8((unsigned char)0xC9); // LEAVE
658 }
659
660 void MacroAssembler::lneg(Register hi, Register lo) {
661 ShouldNotReachHere(); // 64bit doesn't use two regs
662 negq(lo);
663 }
664
665 void MacroAssembler::movoop(Register dst, jobject obj) {
666 mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
667 }
668
669 void MacroAssembler::movoop(Address dst, jobject obj) {
670 mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
671 movq(dst, rscratch1);
672 }
673
674 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
675 mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
676 }
677
678 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
679 mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
680 movq(dst, rscratch1);
681 }
682
683 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
684 if (src.is_lval()) {
685 mov_literal64(dst, (intptr_t)src.target(), src.rspec());
686 } else {
687 if (reachable(src)) {
688 movq(dst, as_Address(src));
689 } else {
690 lea(scratch, src);
691 movq(dst, Address(scratch, 0));
692 }
693 }
694 }
695
696 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
697 movq(as_Address(dst), src);
698 }
699
700 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
701 movq(dst, as_Address(src));
702 }
703
704 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
705 void MacroAssembler::movptr(Address dst, intptr_t src) {
706 if (is_simm32(src)) {
707 movptr(dst, checked_cast<int32_t>(src));
708 } else {
709 mov64(rscratch1, src);
710 movq(dst, rscratch1);
711 }
712 }
713
714 // These are mostly for initializing NULL
715 void MacroAssembler::movptr(Address dst, int32_t src) {
716 movslq(dst, src);
717 }
718
719 void MacroAssembler::movptr(Register dst, int32_t src) {
720 mov64(dst, (intptr_t)src);
721 }
722
723 void MacroAssembler::pushoop(jobject obj) {
724 movoop(rscratch1, obj);
725 push(rscratch1);
726 }
727
728 void MacroAssembler::pushklass(Metadata* obj) {
729 mov_metadata(rscratch1, obj);
730 push(rscratch1);
731 }
732
733 void MacroAssembler::pushptr(AddressLiteral src) {
734 lea(rscratch1, src);
735 if (src.is_lval()) {
736 push(rscratch1);
737 } else {
738 pushq(Address(rscratch1, 0));
739 }
740 }
741
742 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
743 reset_last_Java_frame(r15_thread, clear_fp);
744 }
745
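// Record the last Java frame anchor in the current thread: sp (defaulting to
// rsp), and optionally fp and pc, so the runtime can walk the stack across
// the upcoming VM/native transition.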
746 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
747 Register last_java_fp,
748 address last_java_pc) {
749 vzeroupper();
750 // determine last_java_sp register
751 if (!last_java_sp->is_valid()) {
752 last_java_sp = rsp;
753 }
754
755 // last_java_fp is optional
756 if (last_java_fp->is_valid()) {
757 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
758 last_java_fp);
759 }
760
761 // last_java_pc is optional
762 if (last_java_pc != NULL) {
763 Address java_pc(r15_thread,
764 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
765 lea(rscratch1, InternalAddress(last_java_pc));
766 movptr(java_pc, rscratch1);
767 }
768
769 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
770 }
771
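// On 64-bit, C arguments travel in registers: move the value into its c_rargN
// register unless it is already there.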
772 static void pass_arg0(MacroAssembler* masm, Register arg) {
773 if (c_rarg0 != arg ) {
774 masm->mov(c_rarg0, arg);
775 }
776 }
777
778 static void pass_arg1(MacroAssembler* masm, Register arg) {
779 if (c_rarg1 != arg ) {
780 masm->mov(c_rarg1, arg);
781 }
782 }
783
784 static void pass_arg2(MacroAssembler* masm, Register arg) {
785 if (c_rarg2 != arg ) {
786 masm->mov(c_rarg2, arg);
787 }
788 }
789
790 static void pass_arg3(MacroAssembler* masm, Register arg) {
791 if (c_rarg3 != arg ) {
792 masm->mov(c_rarg3, arg);
793 }
794 }
795
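// Stop with a fatal debug message; with ShowMessageBoxOnError the register
// state and the stopping pc are handed to debug64 so it can display them.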
796 void MacroAssembler::stop(const char* msg) {
797 if (ShowMessageBoxOnError) {
798 address rip = pc();
799 pusha(); // get regs on stack
800 lea(c_rarg1, InternalAddress(rip));
801 movq(c_rarg2, rsp); // pass pointer to regs array
802 }
803 lea(c_rarg0, ExternalAddress((address) msg));
804 andq(rsp, -16); // align stack as required by ABI
805 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
806 hlt();
807 }
808
809 void MacroAssembler::warn(const char* msg) {
810 push(rbp);
811 movq(rbp, rsp);
812 andq(rsp, -16); // align stack as required by push_CPU_state and call
813 push_CPU_state(); // keeps alignment at 16 bytes
814 lea(c_rarg0, ExternalAddress((address) msg));
815 lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
816 call(rax);
817 pop_CPU_state();
818 mov(rsp, rbp);
819 pop(rbp);
820 }
821
822 void MacroAssembler::print_state() {
823 address rip = pc();
824 pusha(); // get regs on stack
825 push(rbp);
826 movq(rbp, rsp);
827 andq(rsp, -16); // align stack as required by push_CPU_state and call
828 push_CPU_state(); // keeps alignment at 16 bytes
829
830 lea(c_rarg0, InternalAddress(rip));
831 lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
832 call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
833
834 pop_CPU_state();
835 mov(rsp, rbp);
836 pop(rbp);
837 popa();
838 }
839
840 #ifndef PRODUCT
841 extern "C" void findpc(intptr_t x);
842 #endif
843
844 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
845   // In order to get locks to work, we need to fake an in_VM state
846 if (ShowMessageBoxOnError) {
847 JavaThread* thread = JavaThread::current();
848 JavaThreadState saved_state = thread->thread_state();
849 thread->set_thread_state(_thread_in_vm);
850 #ifndef PRODUCT
851 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
852 ttyLocker ttyl;
853 BytecodeCounter::print();
854 }
855 #endif
856 // To see where a verify_oop failed, get $ebx+40/X for this frame.
857 // XXX correct this offset for amd64
858 // This is the value of eip which points to where verify_oop will return.
859 if (os::message_box(msg, "Execution stopped, print registers?")) {
860 print_state64(pc, regs);
861 BREAKPOINT;
862 }
863 }
864 fatal("DEBUG MESSAGE: %s", msg);
865 }
866
867 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
868 ttyLocker ttyl;
869 FlagSetting fs(Debugging, true);
870 tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
871 #ifndef PRODUCT
872 tty->cr();
873 findpc(pc);
874 tty->cr();
875 #endif
876 #define PRINT_REG(rax, value) \
877 { tty->print("%s = ", #rax); os::print_location(tty, value); }
878 PRINT_REG(rax, regs[15]);
879 PRINT_REG(rbx, regs[12]);
880 PRINT_REG(rcx, regs[14]);
881 PRINT_REG(rdx, regs[13]);
882 PRINT_REG(rdi, regs[8]);
883 PRINT_REG(rsi, regs[9]);
884 PRINT_REG(rbp, regs[10]);
885 // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
886   PRINT_REG(rsp, (intptr_t)(&regs[16]));
887 PRINT_REG(r8 , regs[7]);
888 PRINT_REG(r9 , regs[6]);
889 PRINT_REG(r10, regs[5]);
890 PRINT_REG(r11, regs[4]);
891 PRINT_REG(r12, regs[3]);
892 PRINT_REG(r13, regs[2]);
893 PRINT_REG(r14, regs[1]);
894 PRINT_REG(r15, regs[0]);
895 #undef PRINT_REG
896 // Print some words near the top of the stack.
897   int64_t* rsp = &regs[16];
898 int64_t* dump_sp = rsp;
899 for (int col1 = 0; col1 < 8; col1++) {
900 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
901 os::print_location(tty, *dump_sp++);
902 }
903 for (int row = 0; row < 25; row++) {
904 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
905 for (int col = 0; col < 4; col++) {
906 tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
907 }
908 tty->cr();
909 }
910 // Print some instructions around pc:
911 Disassembler::decode((address)pc-64, (address)pc);
912 tty->print_cr("--------");
913 Disassembler::decode((address)pc, (address)pc+32);
914 }
915
916 // The java_calling_convention describes stack locations as ideal slots on
917 // a frame with no abi restrictions. Since we must observe abi restrictions
918 // (like the placement of the register window) the slots must be biased by
919 // the following value.
920 static int reg2offset_in(VMReg r) {
921 // Account for saved rbp and return address
922 // This should really be in_preserve_stack_slots
923 return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
924 }
925
926 static int reg2offset_out(VMReg r) {
927 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
928 }
929
930 // A long move
931 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst) {
932
933   // The calling convention assures us that each VMRegPair is either
934   // all really one physical register or adjacent stack slots.
935
936 if (src.is_single_phys_reg() ) {
937 if (dst.is_single_phys_reg()) {
938 if (dst.first() != src.first()) {
939 mov(dst.first()->as_Register(), src.first()->as_Register());
940 }
941 } else {
942 assert(dst.is_single_reg(), "not a stack pair");
943 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
944 }
945 } else if (dst.is_single_phys_reg()) {
946 assert(src.is_single_reg(), "not a stack pair");
947 movq(dst.first()->as_Register(), Address(rbp, reg2offset_out(src.first())));
948 } else {
949 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
950 movq(rax, Address(rbp, reg2offset_in(src.first())));
951 movq(Address(rsp, reg2offset_out(dst.first())), rax);
952 }
953 }
954
955 // A double move
956 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst) {
957
958   // The calling convention assures us that each VMRegPair is either
959   // all really one physical register or adjacent stack slots.
960
961 if (src.is_single_phys_reg() ) {
962 if (dst.is_single_phys_reg()) {
963 // In theory these overlap but the ordering is such that this is likely a nop
964 if ( src.first() != dst.first()) {
965 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
966 }
967 } else {
968 assert(dst.is_single_reg(), "not a stack pair");
969 movdbl(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
970 }
971 } else if (dst.is_single_phys_reg()) {
972 assert(src.is_single_reg(), "not a stack pair");
973 movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_out(src.first())));
974 } else {
975 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
976 movq(rax, Address(rbp, reg2offset_in(src.first())));
977 movq(Address(rsp, reg2offset_out(dst.first())), rax);
978 }
979 }
980
981
982 // A float arg may have to do float reg int reg conversion
983 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst) {
984 assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
985
986   // The calling convention assures us that each VMRegPair is either
987   // all really one physical register or adjacent stack slots.
988
989 if (src.first()->is_stack()) {
990 if (dst.first()->is_stack()) {
991 movl(rax, Address(rbp, reg2offset_in(src.first())));
992 movptr(Address(rsp, reg2offset_out(dst.first())), rax);
993 } else {
994 // stack to reg
995 assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
996 movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first())));
997 }
998 } else if (dst.first()->is_stack()) {
999 // reg to stack
1000 assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
1001 movflt(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
1002 } else {
1003 // reg to reg
1004 // In theory these overlap but the ordering is such that this is likely a nop
1005 if ( src.first() != dst.first()) {
1006 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
1007 }
1008 }
1009 }
1010
1011 // On 64 bit we will store integer-like items to the stack as
1012 // 64-bit items (x86_32/64 ABI) even though Java would only store
1013 // 32 bits for a parameter. On 32 bit it will simply be 32 bits,
1014 // so this routine will do 32->32 on 32 bit and 32->64 on 64 bit.
1015 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst) {
1016 if (src.first()->is_stack()) {
1017 if (dst.first()->is_stack()) {
1018 // stack to stack
1019 movslq(rax, Address(rbp, reg2offset_in(src.first())));
1020 movq(Address(rsp, reg2offset_out(dst.first())), rax);
1021 } else {
1022 // stack to reg
1023 movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1024 }
1025 } else if (dst.first()->is_stack()) {
1026 // reg to stack
1027 // Do we really have to sign extend???
1028 // __ movslq(src.first()->as_Register(), src.first()->as_Register());
1029 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1030 } else {
1031 // Do we really have to sign extend???
1032 // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
1033 if (dst.first() != src.first()) {
1034 movq(dst.first()->as_Register(), src.first()->as_Register());
1035 }
1036 }
1037 }
1038
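// A pointer-sized move between any combination of stack slots and registers.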
1039 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
1040 if (src.first()->is_stack()) {
1041 if (dst.first()->is_stack()) {
1042 // stack to stack
1043 movq(rax, Address(rbp, reg2offset_in(src.first())));
1044 movq(Address(rsp, reg2offset_out(dst.first())), rax);
1045 } else {
1046 // stack to reg
1047 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1048 }
1049 } else if (dst.first()->is_stack()) {
1050 // reg to stack
1051 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1052 } else {
1053 if (dst.first() != src.first()) {
1054 movq(dst.first()->as_Register(), src.first()->as_Register());
1055 }
1056 }
1057 }
1058
1059 // An oop arg. Must pass a handle not the oop itself
1060 void MacroAssembler::object_move(OopMap* map,
1061 int oop_handle_offset,
1062 int framesize_in_slots,
1063 VMRegPair src,
1064 VMRegPair dst,
1065 bool is_receiver,
1066 int* receiver_offset) {
1067
1068 // must pass a handle. First figure out the location we use as a handle
1069
1070 Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1071
1072   // See if oop is NULL; if it is we need no handle
1073
1074 if (src.first()->is_stack()) {
1075
1076 // Oop is already on the stack as an argument
1077 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1078 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1079 if (is_receiver) {
1080 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1081 }
1082
1083 cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
1084 lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1085 // conditionally move a NULL
1086 cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1087 } else {
1088
1089     // Oop is in a register; we must store it to the space we reserve
1090     // on the stack for oop_handles and pass a handle if oop is non-NULL
1091
1092 const Register rOop = src.first()->as_Register();
1093 int oop_slot;
1094 if (rOop == j_rarg0)
1095 oop_slot = 0;
1096 else if (rOop == j_rarg1)
1097 oop_slot = 1;
1098 else if (rOop == j_rarg2)
1099 oop_slot = 2;
1100 else if (rOop == j_rarg3)
1101 oop_slot = 3;
1102 else if (rOop == j_rarg4)
1103 oop_slot = 4;
1104 else {
1105 assert(rOop == j_rarg5, "wrong register");
1106 oop_slot = 5;
1107 }
1108
1109 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1110 int offset = oop_slot*VMRegImpl::stack_slot_size;
1111
1112 map->set_oop(VMRegImpl::stack2reg(oop_slot));
1113 // Store oop in handle area, may be NULL
1114 movptr(Address(rsp, offset), rOop);
1115 if (is_receiver) {
1116 *receiver_offset = offset;
1117 }
1118
1119 cmpptr(rOop, (int32_t)NULL_WORD);
1120 lea(rHandle, Address(rsp, offset));
1121 // conditionally move a NULL from the handle area where it was just stored
1122 cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1123 }
1124
1125   // If arg is on the stack then place it there, otherwise it is already in the correct reg.
1126 if (dst.first()->is_stack()) {
1127 movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1128 }
1129 }
1130
1131 #endif // _LP64
1132
1133 // Now versions that are common to 32/64 bit
1134
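// These *ptr helpers expand to the 64-bit (q) instruction forms on LP64 and
// the 32-bit (l) forms otherwise, letting callers stay word-size agnostic.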
1135 void MacroAssembler::addptr(Register dst, int32_t imm32) {
1136 LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
1137 }
1138
1139 void MacroAssembler::addptr(Register dst, Register src) {
1140 LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1141 }
1142
1143 void MacroAssembler::addptr(Address dst, Register src) {
1144 LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1145 }
1146
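// Arithmetic on AddressLiteral operands: use the literal address directly
// when it is reachable from the current code, otherwise materialize it in a
// scratch register and go through an indirect operand.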
1147 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
1148 if (reachable(src)) {
1149 Assembler::addsd(dst, as_Address(src));
1150 } else {
1151 lea(rscratch1, src);
1152 Assembler::addsd(dst, Address(rscratch1, 0));
1153 }
1154 }
1155
1156 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
1157 if (reachable(src)) {
1158 addss(dst, as_Address(src));
1159 } else {
1160 lea(rscratch1, src);
1161 addss(dst, Address(rscratch1, 0));
1162 }
1163 }
1164
1165 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
1166 if (reachable(src)) {
1167 Assembler::addpd(dst, as_Address(src));
1168 } else {
1169 lea(rscratch1, src);
1170 Assembler::addpd(dst, Address(rscratch1, 0));
1171 }
1172 }
1173
1174 // See 8273459. Function for ensuring 64-byte alignment, intended for stubs only.
1175 // Stub code is generated once and never copied.
1176 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
1177 void MacroAssembler::align64() {
1178 align(64, (unsigned long long) pc());
1179 }
1180
1181 void MacroAssembler::align32() {
1182 align(32, (unsigned long long) pc());
1183 }
1184
1185 void MacroAssembler::align(int modulus) {
1186 // 8273459: Ensure alignment is possible with current segment alignment
1187 assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
1188 align(modulus, offset());
1189 }
1190
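// Pad with nops until the position given by 'target' reaches a multiple of
// 'modulus'.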
1191 void MacroAssembler::align(int modulus, int target) {
1192 if (target % modulus != 0) {
1193 nop(modulus - (target % modulus));
1194 }
1195 }
1196
1197 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1198 // Used in sign-masking with aligned address.
1199 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1200 if (reachable(src)) {
1201 Assembler::andpd(dst, as_Address(src));
1202 } else {
1203 lea(scratch_reg, src);
1204 Assembler::andpd(dst, Address(scratch_reg, 0));
1205 }
1206 }
1207
1208 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1209 // Used in sign-masking with aligned address.
1210 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1211 if (reachable(src)) {
1212 Assembler::andps(dst, as_Address(src));
1213 } else {
1214 lea(scratch_reg, src);
1215 Assembler::andps(dst, Address(scratch_reg, 0));
1216 }
1217 }
1218
1219 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1220 LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1221 }
1222
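// Atomically increment a counter in memory (lock-prefixed increment).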
1223 void MacroAssembler::atomic_incl(Address counter_addr) {
1224 lock();
1225 incrementl(counter_addr);
1226 }
1227
1228 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1229 if (reachable(counter_addr)) {
1230 atomic_incl(as_Address(counter_addr));
1231 } else {
1232 lea(scr, counter_addr);
1233 atomic_incl(Address(scr, 0));
1234 }
1235 }
1236
1237 #ifdef _LP64
1238 void MacroAssembler::atomic_incq(Address counter_addr) {
1239 lock();
1240 incrementq(counter_addr);
1241 }
1242
1243 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1244 if (reachable(counter_addr)) {
1245 atomic_incq(as_Address(counter_addr));
1246 } else {
1247 lea(scr, counter_addr);
1248 atomic_incq(Address(scr, 0));
1249 }
1250 }
1251 #endif
1252
1253 // Writes to successive stack pages until the given offset is reached, to check
1254 // for stack overflow + shadow pages.  This clobbers tmp.
1255 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1256 movptr(tmp, rsp);
1257 // Bang stack for total size given plus shadow page size.
1258 // Bang one page at a time because large size can bang beyond yellow and
1259 // red zones.
1260 Label loop;
1261 bind(loop);
1262 movl(Address(tmp, (-os::vm_page_size())), size );
1263 subptr(tmp, os::vm_page_size());
1264 subl(size, os::vm_page_size());
1265 jcc(Assembler::greater, loop);
1266
1267 // Bang down shadow pages too.
1268 // At this point, (tmp-0) is the last address touched, so don't
1269 // touch it again. (It was touched as (tmp-pagesize) but then tmp
1270 // was post-decremented.) Skip this address by starting at i=1, and
1271 // touch a few more pages below. N.B. It is important to touch all
1272 // the way down including all pages in the shadow zone.
1273 for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / os::vm_page_size()); i++) {
1274     // this could be any sized move but this can be a debugging crumb
1275     // so the bigger the better.
1276 movptr(Address(tmp, (-i*os::vm_page_size())), size );
1277 }
1278 }
1279
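// Check whether the stack pointer has hit the reserved-zone activation point;
// if so, call into the VM to enable the reserved zone and throw the delayed
// StackOverflowError.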
1280 void MacroAssembler::reserved_stack_check() {
1281 // testing if reserved zone needs to be enabled
1282 Label no_reserved_zone_enabling;
1283 Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1284 NOT_LP64(get_thread(rsi);)
1285
1286 cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1287 jcc(Assembler::below, no_reserved_zone_enabling);
1288
1289 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1290 jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1291 should_not_reach_here();
1292
1293 bind(no_reserved_zone_enabling);
1294 }
1295
1296 void MacroAssembler::biased_locking_enter(Register lock_reg,
1297 Register obj_reg,
1298 Register swap_reg,
1299 Register tmp_reg,
1300 Register tmp_reg2,
1301 bool swap_reg_contains_mark,
1302 Label& done,
1303 Label* slow_case,
1304 BiasedLockingCounters* counters) {
1305 assert(UseBiasedLocking, "why call this otherwise?");
1306 assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1307 assert(tmp_reg != noreg, "tmp_reg must be supplied");
1308 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1309 assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
1310 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes());
1311 NOT_LP64( Address saved_mark_addr(lock_reg, 0); )
1312
1313 if (PrintBiasedLockingStatistics && counters == NULL) {
1314 counters = BiasedLocking::counters();
1315 }
1316 // Biased locking
1317 // See whether the lock is currently biased toward our thread and
1318 // whether the epoch is still valid
1319 // Note that the runtime guarantees sufficient alignment of JavaThread
1320 // pointers to allow age to be placed into low bits
1321 // First check to see whether biasing is even enabled for this object
1322 Label cas_label;
1323 if (!swap_reg_contains_mark) {
1324 movptr(swap_reg, mark_addr);
1325 }
1326 movptr(tmp_reg, swap_reg);
1327 andptr(tmp_reg, markWord::biased_lock_mask_in_place);
1328 cmpptr(tmp_reg, markWord::biased_lock_pattern);
1329 jcc(Assembler::notEqual, cas_label);
1330 // The bias pattern is present in the object's header. Need to check
1331 // whether the bias owner and the epoch are both still current.
1332 #ifndef _LP64
1333 // Note that because there is no current thread register on x86_32 we
1334 // need to store off the mark word we read out of the object to
1335 // avoid reloading it and needing to recheck invariants below. This
1336 // store is unfortunate but it makes the overall code shorter and
1337 // simpler.
1338 movptr(saved_mark_addr, swap_reg);
1339 #endif
1340 load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
1341 #ifdef _LP64
1342 orptr(tmp_reg, r15_thread);
1343 xorptr(tmp_reg, swap_reg);
1344 Register header_reg = tmp_reg;
1345 #else
1346 xorptr(tmp_reg, swap_reg);
1347 get_thread(swap_reg);
1348 xorptr(swap_reg, tmp_reg);
1349 Register header_reg = swap_reg;
1350 #endif
1351 andptr(header_reg, ~((int) markWord::age_mask_in_place));
1352 if (counters != NULL) {
1353 cond_inc32(Assembler::zero,
1354 ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1355 }
1356 jcc(Assembler::equal, done);
1357
1358 Label try_revoke_bias;
1359 Label try_rebias;
1360
1361 // At this point we know that the header has the bias pattern and
1362 // that we are not the bias owner in the current epoch. We need to
1363 // figure out more details about the state of the header in order to
1364 // know what operations can be legally performed on the object's
1365 // header.
1366
1367 // If the low three bits in the xor result aren't clear, that means
1368 // the prototype header is no longer biased and we have to revoke
1369 // the bias on this object.
1370 testptr(header_reg, markWord::biased_lock_mask_in_place);
1371 jcc(Assembler::notZero, try_revoke_bias);
1372
1373 // Biasing is still enabled for this data type. See whether the
1374 // epoch of the current bias is still valid, meaning that the epoch
1375 // bits of the mark word are equal to the epoch bits of the
1376 // prototype header. (Note that the prototype header's epoch bits
1377 // only change at a safepoint.) If not, attempt to rebias the object
1378 // toward the current thread. Note that we must be absolutely sure
1379 // that the current epoch is invalid in order to do this because
1380 // otherwise the manipulations it performs on the mark word are
1381 // illegal.
1382 testptr(header_reg, markWord::epoch_mask_in_place);
1383 jccb(Assembler::notZero, try_rebias);
1384
1385 // The epoch of the current bias is still valid but we know nothing
1386 // about the owner; it might be set or it might be clear. Try to
1387 // acquire the bias of the object using an atomic operation. If this
1388   // fails we will go into the runtime to revoke the object's bias.
1389 // Note that we first construct the presumed unbiased header so we
1390 // don't accidentally blow away another thread's valid bias.
1391 NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1392 andptr(swap_reg,
1393 markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
1394 #ifdef _LP64
1395 movptr(tmp_reg, swap_reg);
1396 orptr(tmp_reg, r15_thread);
1397 #else
1398 get_thread(tmp_reg);
1399 orptr(tmp_reg, swap_reg);
1400 #endif
1401 lock();
1402 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1403 // If the biasing toward our thread failed, this means that
1404 // another thread succeeded in biasing it toward itself and we
1405 // need to revoke that bias. The revocation will occur in the
1406 // interpreter runtime in the slow case.
1407 if (counters != NULL) {
1408 cond_inc32(Assembler::zero,
1409 ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1410 }
1411 if (slow_case != NULL) {
1412 jcc(Assembler::notZero, *slow_case);
1413 }
1414 jmp(done);
1415
1416 bind(try_rebias);
1417 // At this point we know the epoch has expired, meaning that the
1418 // current "bias owner", if any, is actually invalid. Under these
1419 // circumstances _only_, we are allowed to use the current header's
1420 // value as the comparison value when doing the cas to acquire the
1421 // bias in the current epoch. In other words, we allow transfer of
1422 // the bias from one thread to another directly in this situation.
1423 //
1424 // FIXME: due to a lack of registers we currently blow away the age
1425 // bits in this situation. Should attempt to preserve them.
1426 load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
1427 #ifdef _LP64
1428 orptr(tmp_reg, r15_thread);
1429 #else
1430 get_thread(swap_reg);
1431 orptr(tmp_reg, swap_reg);
1432 movptr(swap_reg, saved_mark_addr);
1433 #endif
1434 lock();
1435 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1436 // If the biasing toward our thread failed, then another thread
1437 // succeeded in biasing it toward itself and we need to revoke that
1438 // bias. The revocation will occur in the runtime in the slow case.
1439 if (counters != NULL) {
1440 cond_inc32(Assembler::zero,
1441 ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1442 }
1443 if (slow_case != NULL) {
1444 jcc(Assembler::notZero, *slow_case);
1445 }
1446 jmp(done);
1447
1448 bind(try_revoke_bias);
1449 // The prototype mark in the klass doesn't have the bias bit set any
1450 // more, indicating that objects of this data type are not supposed
1451 // to be biased any more. We are going to try to reset the mark of
1452 // this object to the prototype value and fall through to the
1453 // CAS-based locking scheme. Note that if our CAS fails, it means
1454 // that another thread raced us for the privilege of revoking the
1455 // bias of this particular object, so it's okay to continue in the
1456 // normal locking code.
1457 //
1458 // FIXME: due to a lack of registers we currently blow away the age
1459 // bits in this situation. Should attempt to preserve them.
1460 NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1461 load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
1462 lock();
1463 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1464 // Fall through to the normal CAS-based lock, because no matter what
1465 // the result of the above CAS, some thread must have succeeded in
1466 // removing the bias bit from the object's header.
1467 if (counters != NULL) {
1468 cond_inc32(Assembler::zero,
1469 ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1470 }
1471
1472 bind(cas_label);
1473 }
1474
1475 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1476 assert(UseBiasedLocking, "why call this otherwise?");
1477
1478 // Check for biased locking unlock case, which is a no-op
1479 // Note: we do not have to check the thread ID for two reasons.
1480 // First, the interpreter checks for IllegalMonitorStateException at
1481 // a higher level. Second, if the bias was revoked while we held the
1482 // lock, the object could not be rebiased toward another thread, so
1483 // the bias bit would be clear.
1484 movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1485 andptr(temp_reg, markWord::biased_lock_mask_in_place);
1486 cmpptr(temp_reg, markWord::biased_lock_pattern);
1487 jcc(Assembler::equal, done);
1488 }
1489
1490 void MacroAssembler::c2bool(Register x) {
1491 // implements x == 0 ? 0 : 1
1492 // note: must only look at least-significant byte of x
1493 // since C-style booleans are stored in one byte
1494 // only! (was bug)
1495 andl(x, 0xFF);
1496 setb(Assembler::notZero, x);
1497 }
1498
1499 // Wouldn't need if AddressLiteral version had new name
1500 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1501 Assembler::call(L, rtype);
1502 }
1503
1504 void MacroAssembler::call(Register entry) {
1505 Assembler::call(entry);
1506 }
1507
1508 void MacroAssembler::call(AddressLiteral entry) {
1509 if (reachable(entry)) {
1510 Assembler::call_literal(entry.target(), entry.rspec());
1511 } else {
1512 lea(rscratch1, entry);
1513 Assembler::call(rscratch1);
1514 }
1515 }
1516
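// Inline cache call: rax is loaded with the non-oop sentinel as the initial
// cached value, and the call carries a virtual_call relocation recording
// method_index.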
1517 void MacroAssembler::ic_call(address entry, jint method_index) {
1518 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
1519 movptr(rax, (intptr_t)Universe::non_oop_word());
1520 call(AddressLiteral(entry, rh));
1521 }
1522
1523 // Implementation of call_VM versions
1524
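// Each short-arg call_VM variant below uses a call(C)/jmp(E) pair: the call
// pushes a return address for the VM call machinery and branches to the
// out-of-line block at C, where the arguments are passed and call_VM_helper
// does the real work; the ret(0) then resumes at the jmp, which skips past
// the block to E.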
1525 void MacroAssembler::call_VM(Register oop_result,
1526 address entry_point,
1527 bool check_exceptions) {
1528 Label C, E;
1529 call(C, relocInfo::none);
1530 jmp(E);
1531
1532 bind(C);
1533 call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1534 ret(0);
1535
1536 bind(E);
1537 }
1538
1539 void MacroAssembler::call_VM(Register oop_result,
1540 address entry_point,
1541 Register arg_1,
1542 bool check_exceptions) {
1543 Label C, E;
1544 call(C, relocInfo::none);
1545 jmp(E);
1546
1547 bind(C);
1548 pass_arg1(this, arg_1);
1549 call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1550 ret(0);
1551
1552 bind(E);
1553 }
1554
1555 void MacroAssembler::call_VM(Register oop_result,
1556 address entry_point,
1557 Register arg_1,
1558 Register arg_2,
1559 bool check_exceptions) {
1560 Label C, E;
1561 call(C, relocInfo::none);
1562 jmp(E);
1563
1564 bind(C);
1565
1566 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1567
1568 pass_arg2(this, arg_2);
1569 pass_arg1(this, arg_1);
1570 call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1571 ret(0);
1572
1573 bind(E);
1574 }
1575
1576 void MacroAssembler::call_VM(Register oop_result,
1577 address entry_point,
1578 Register arg_1,
1579 Register arg_2,
1580 Register arg_3,
1581 bool check_exceptions) {
1582 Label C, E;
1583 call(C, relocInfo::none);
1584 jmp(E);
1585
1586 bind(C);
1587
1588 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1589 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1590 pass_arg3(this, arg_3);
1591
1592 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1593 pass_arg2(this, arg_2);
1594
1595 pass_arg1(this, arg_1);
1596 call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1597 ret(0);
1598
1599 bind(E);
1600 }
1601
1602 void MacroAssembler::call_VM(Register oop_result,
1603 Register last_java_sp,
1604 address entry_point,
1605 int number_of_arguments,
1606 bool check_exceptions) {
1607 Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1608 call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1609 }
1610
1611 void MacroAssembler::call_VM(Register oop_result,
1612 Register last_java_sp,
1613 address entry_point,
1614 Register arg_1,
1615 bool check_exceptions) {
1616 pass_arg1(this, arg_1);
1617 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1618 }
1619
1620 void MacroAssembler::call_VM(Register oop_result,
1621 Register last_java_sp,
1622 address entry_point,
1623 Register arg_1,
1624 Register arg_2,
1625 bool check_exceptions) {
1626
1627 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1628 pass_arg2(this, arg_2);
1629 pass_arg1(this, arg_1);
1630 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1631 }
1632
1633 void MacroAssembler::call_VM(Register oop_result,
1634 Register last_java_sp,
1635 address entry_point,
1636 Register arg_1,
1637 Register arg_2,
1638 Register arg_3,
1639 bool check_exceptions) {
1640 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1641 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1642 pass_arg3(this, arg_3);
1643 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1644 pass_arg2(this, arg_2);
1645 pass_arg1(this, arg_1);
1646 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1647 }
1648
1649 void MacroAssembler::super_call_VM(Register oop_result,
1650 Register last_java_sp,
1651 address entry_point,
1652 int number_of_arguments,
1653 bool check_exceptions) {
1654 Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1655 MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1656 }
1657
1658 void MacroAssembler::super_call_VM(Register oop_result,
1659 Register last_java_sp,
1660 address entry_point,
1661 Register arg_1,
1662 bool check_exceptions) {
1663 pass_arg1(this, arg_1);
1664 super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1665 }
1666
1667 void MacroAssembler::super_call_VM(Register oop_result,
1668 Register last_java_sp,
1669 address entry_point,
1670 Register arg_1,
1671 Register arg_2,
1672 bool check_exceptions) {
1673
1674 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1675 pass_arg2(this, arg_2);
1676 pass_arg1(this, arg_1);
1677 super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1678 }
1679
1680 void MacroAssembler::super_call_VM(Register oop_result,
1681 Register last_java_sp,
1682 address entry_point,
1683 Register arg_1,
1684 Register arg_2,
1685 Register arg_3,
1686 bool check_exceptions) {
1687 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1688 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1689 pass_arg3(this, arg_3);
1690 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1691 pass_arg2(this, arg_2);
1692 pass_arg1(this, arg_1);
1693 super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1694 }
1695
1696 void MacroAssembler::call_VM_base(Register oop_result,
1697 Register java_thread,
1698 Register last_java_sp,
1699 address entry_point,
1700 int number_of_arguments,
1701 bool check_exceptions) {
1702 // determine java_thread register
1703 if (!java_thread->is_valid()) {
1704 #ifdef _LP64
1705 java_thread = r15_thread;
1706 #else
1707 java_thread = rdi;
1708 get_thread(java_thread);
1709 #endif // LP64
1710 }
1711 // determine last_java_sp register
1712 if (!last_java_sp->is_valid()) {
1713 last_java_sp = rsp;
1714 }
1715 // debugging support
1716 assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
1717 LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
1718 #ifdef ASSERT
1719 // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1720 // r12 is the heapbase.
1721 LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
1722 #endif // ASSERT
1723
1724 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
1725 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1726
1727 // push java thread (becomes first argument of C function)
1728
1729 NOT_LP64(push(java_thread); number_of_arguments++);
1730 LP64_ONLY(mov(c_rarg0, r15_thread));
1731
1732 // set last Java frame before call
1733 assert(last_java_sp != rbp, "can't use ebp/rbp");
1734
1735 // Only interpreter should have to set fp
1736 set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
1737
1738 // do the call, remove parameters
1739 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1740
1741 // restore the thread (cannot use the pushed argument since arguments
1742 // may be overwritten by C code generated by an optimizing compiler);
1743 // however, we can use the register value directly if it is callee saved.
1744 if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
1745 // rdi & rsi (also r15) are callee saved -> nothing to do
1746 #ifdef ASSERT
1747 guarantee(java_thread != rax, "change this code");
1748 push(rax);
1749 { Label L;
1750 get_thread(rax);
1751 cmpptr(java_thread, rax);
1752 jcc(Assembler::equal, L);
1753 STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
1754 bind(L);
1755 }
1756 pop(rax);
1757 #endif
1758 } else {
1759 get_thread(java_thread);
1760 }
1761 // reset last Java frame
1762 // Only interpreter should have to clear fp
1763 reset_last_Java_frame(java_thread, true);
1764
1765 // C++ interp handles this in the interpreter
1766 check_and_handle_popframe(java_thread);
1767 check_and_handle_earlyret(java_thread);
1768
1769 if (check_exceptions) {
1770 // check for pending exceptions (java_thread is set upon return)
1771 cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
1772 #ifndef _LP64
1773 jump_cc(Assembler::notEqual,
1774 RuntimeAddress(StubRoutines::forward_exception_entry()));
1775 #else
1776 // This used to conditionally jump to forward_exception; however, if the
1777 // code is relocated, the conditional branch might not reach the target.
1778 // So we jump around an unconditional jump that can always reach it.
1779
1780 Label ok;
1781 jcc(Assembler::equal, ok);
1782 jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1783 bind(ok);
1784 #endif // LP64
1785 }
1786
1787 // get oop result if there is one and reset the value in the thread
1788 if (oop_result->is_valid()) {
1789 get_vm_result(oop_result, java_thread);
1790 }
1791 }
1792
1793 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1794
1795 // Calculate the value for last_Java_sp
1796 // This is somewhat subtle. call_VM does an intermediate call
1797 // which places a return address on the stack just under the
1798 // stack pointer as the caller left it. This allows
1799 // us to retrieve last_Java_pc from last_Java_sp[-1].
1800 // On 32bit we then have to push additional args on the stack to accomplish
1801 // the actual requested call. On 64bit call_VM can only use register args
1802 // so the only extra space is the return address that call_VM created.
1803 // This hopefully explains the calculations here.
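  //
  // For example, on 64 bit immediately after the intermediate call:
  //
  //   rsp            -> [ return address ]   (last_Java_sp[-1], i.e. last_Java_pc)
  //   rsp + wordSize -> caller's stack slot  (this is last_Java_sp)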
1804
1805 #ifdef _LP64
1806 // We've pushed one address, correct last_Java_sp
1807 lea(rax, Address(rsp, wordSize));
1808 #else
1809 lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
1810 #endif // LP64
1811
1812 call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
1813
1814 }
1815
1816 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1817 void MacroAssembler::call_VM_leaf0(address entry_point) {
1818 MacroAssembler::call_VM_leaf_base(entry_point, 0);
1819 }
1820
1821 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1822 call_VM_leaf_base(entry_point, number_of_arguments);
1823 }
1824
1825 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1826 pass_arg0(this, arg_0);
1827 call_VM_leaf(entry_point, 1);
1828 }
1829
1830 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1831
1832 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1833 pass_arg1(this, arg_1);
1834 pass_arg0(this, arg_0);
1835 call_VM_leaf(entry_point, 2);
1836 }
1837
1838 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1839 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1840 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1841 pass_arg2(this, arg_2);
1842 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1843 pass_arg1(this, arg_1);
1844 pass_arg0(this, arg_0);
1845 call_VM_leaf(entry_point, 3);
1846 }
1847
1848 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1849 pass_arg0(this, arg_0);
1850 MacroAssembler::call_VM_leaf_base(entry_point, 1);
1851 }
1852
1853 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1854
1855 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1856 pass_arg1(this, arg_1);
1857 pass_arg0(this, arg_0);
1858 MacroAssembler::call_VM_leaf_base(entry_point, 2);
1859 }
1860
1861 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1862 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1863 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1864 pass_arg2(this, arg_2);
1865 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1866 pass_arg1(this, arg_1);
1867 pass_arg0(this, arg_0);
1868 MacroAssembler::call_VM_leaf_base(entry_point, 3);
1869 }
1870
1871 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1872 LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1873 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1874 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1875 pass_arg3(this, arg_3);
1876 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1877 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1878 pass_arg2(this, arg_2);
1879 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1880 pass_arg1(this, arg_1);
1881 pass_arg0(this, arg_0);
1882 MacroAssembler::call_VM_leaf_base(entry_point, 4);
1883 }
1884
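// Fetch the oop that the VM call stored in JavaThread::_vm_result, clear the
// field so that no stale oop is left behind, and verify it when VerifyOops is
// enabled.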
1885 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1886 movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1887 movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
1888 verify_oop_msg(oop_result, "broken oop in call_VM_base");
1889 }
1890
1891 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1892 movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1893 movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
1894 }
1895
1896 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
1897 }
1898
1899 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
1900 }
1901
1902 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
1903 if (reachable(src1)) {
1904 cmpl(as_Address(src1), imm);
1905 } else {
1906 lea(rscratch1, src1);
1907 cmpl(Address(rscratch1, 0), imm);
1908 }
1909 }
1910
1911 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
1912 assert(!src2.is_lval(), "use cmpptr");
1913 if (reachable(src2)) {
1914 cmpl(src1, as_Address(src2));
1915 } else {
1916 lea(rscratch1, src2);
1917 cmpl(src1, Address(rscratch1, 0));
1918 }
1919 }
1920
1921 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1922 Assembler::cmpl(src1, imm);
1923 }
1924
1925 void MacroAssembler::cmp32(Register src1, Address src2) {
1926 Assembler::cmpl(src1, src2);
1927 }
1928
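// Compare two doubles and materialize the result as an integer in dst:
// dst = -1 if opr1 < opr2, 0 if equal, +1 if opr1 > opr2. An unordered result
// (NaN) yields -1 or +1 depending on unordered_is_less, matching the two
// variants of the Java floating-point compare bytecodes (dcmpl/dcmpg).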
1929 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1930 ucomisd(opr1, opr2);
1931
1932 Label L;
1933 if (unordered_is_less) {
1934 movl(dst, -1);
1935 jcc(Assembler::parity, L);
1936 jcc(Assembler::below , L);
1937 movl(dst, 0);
1938 jcc(Assembler::equal , L);
1939 increment(dst);
1940 } else { // unordered is greater
1941 movl(dst, 1);
1942 jcc(Assembler::parity, L);
1943 jcc(Assembler::above , L);
1944 movl(dst, 0);
1945 jcc(Assembler::equal , L);
1946 decrementl(dst);
1947 }
1948 bind(L);
1949 }
1950
1951 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1952 ucomiss(opr1, opr2);
1953
1954 Label L;
1955 if (unordered_is_less) {
1956 movl(dst, -1);
1957 jcc(Assembler::parity, L);
1958 jcc(Assembler::below , L);
1959 movl(dst, 0);
1960 jcc(Assembler::equal , L);
1961 increment(dst);
1962 } else { // unordered is greater
1963 movl(dst, 1);
1964 jcc(Assembler::parity, L);
1965 jcc(Assembler::above , L);
1966 movl(dst, 0);
1967 jcc(Assembler::equal , L);
1968 decrementl(dst);
1969 }
1970 bind(L);
1971 }
1972
1973
1974 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
1975 if (reachable(src1)) {
1976 cmpb(as_Address(src1), imm);
1977 } else {
1978 lea(rscratch1, src1);
1979 cmpb(Address(rscratch1, 0), imm);
1980 }
1981 }
1982
1983 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
1984 #ifdef _LP64
1985 if (src2.is_lval()) {
1986 movptr(rscratch1, src2);
1987 Assembler::cmpq(src1, rscratch1);
1988 } else if (reachable(src2)) {
1989 cmpq(src1, as_Address(src2));
1990 } else {
1991 lea(rscratch1, src2);
1992 Assembler::cmpq(src1, Address(rscratch1, 0));
1993 }
1994 #else
1995 if (src2.is_lval()) {
1996 cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1997 } else {
1998 cmpl(src1, as_Address(src2));
1999 }
2000 #endif // _LP64
2001 }
2002
2003 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2004 assert(src2.is_lval(), "not a mem-mem compare");
2005 #ifdef _LP64
2006 // moves src2's literal address
2007 movptr(rscratch1, src2);
2008 Assembler::cmpq(src1, rscratch1);
2009 #else
2010 cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2011 #endif // _LP64
2012 }
2013
2014 void MacroAssembler::cmpoop(Register src1, Register src2) {
2015 cmpptr(src1, src2);
2016 }
2017
2018 void MacroAssembler::cmpoop(Register src1, Address src2) {
2019 cmpptr(src1, src2);
2020 }
2021
2022 #ifdef _LP64
2023 void MacroAssembler::cmpoop(Register src1, jobject src2) {
2024 movoop(rscratch1, src2);
2025 cmpptr(src1, rscratch1);
2026 }
2027 #endif
2028
2029 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2030 if (reachable(adr)) {
2031 lock();
2032 cmpxchgptr(reg, as_Address(adr));
2033 } else {
2034 lea(rscratch1, adr);
2035 lock();
2036 cmpxchgptr(reg, Address(rscratch1, 0));
2037 }
2038 }
2039
2040 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2041 LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2042 }
2043
2044 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2045 if (reachable(src)) {
2046 Assembler::comisd(dst, as_Address(src));
2047 } else {
2048 lea(rscratch1, src);
2049 Assembler::comisd(dst, Address(rscratch1, 0));
2050 }
2051 }
2052
2053 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2054 if (reachable(src)) {
2055 Assembler::comiss(dst, as_Address(src));
2056 } else {
2057 lea(rscratch1, src);
2058 Assembler::comiss(dst, Address(rscratch1, 0));
2059 }
2060 }
2061
2062
2063 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2064 Condition negated_cond = negate_condition(cond);
2065 Label L;
2066 jcc(negated_cond, L);
2067 pushf(); // Preserve flags
2068 atomic_incl(counter_addr);
2069 popf();
2070 bind(L);
2071 }
2072
2073 int MacroAssembler::corrected_idivl(Register reg) {
2074 // Full implementation of Java idiv and irem; checks for
2075 // the special case described in the JVM spec., p.243 & p.271.
2076 // The function returns the (pc) offset of the idivl
2077 // instruction - may be needed for implicit exceptions.
2078 //
2079 // normal case special case
2080 //
2081 // input : rax,: dividend min_int
2082 // reg: divisor (may not be rax,/rdx) -1
2083 //
2084 // output: rax,: quotient (= rax, idiv reg) min_int
2085 // rdx: remainder (= rax, irem reg) 0
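  //
  // For example, min_int / -1 cannot be represented as a 32 bit quotient
  // (2^31 overflows int) and idivl would raise a divide error for it, so the
  // code checks for it first and simply leaves min_int in rax with 0 in rdx.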
2086 assert(reg != rax && reg != rdx, "reg cannot be rax or rdx");
2087 const int min_int = 0x80000000;
2088 Label normal_case, special_case;
2089
2090 // check for special case
2091 cmpl(rax, min_int);
2092 jcc(Assembler::notEqual, normal_case);
2093 xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2094 cmpl(reg, -1);
2095 jcc(Assembler::equal, special_case);
2096
2097 // handle normal case
2098 bind(normal_case);
2099 cdql();
2100 int idivl_offset = offset();
2101 idivl(reg);
2102
2103 // normal and special case exit
2104 bind(special_case);
2105
2106 return idivl_offset;
2107 }
2108
2109
2110
2111 void MacroAssembler::decrementl(Register reg, int value) {
2112 if (value == min_jint) {subl(reg, value) ; return; }
2113 if (value < 0) { incrementl(reg, -value); return; }
2114 if (value == 0) { ; return; }
2115 if (value == 1 && UseIncDec) { decl(reg) ; return; }
2116 /* else */ { subl(reg, value) ; return; }
2117 }
2118
2119 void MacroAssembler::decrementl(Address dst, int value) {
2120 if (value == min_jint) {subl(dst, value) ; return; }
2121 if (value < 0) { incrementl(dst, -value); return; }
2122 if (value == 0) { ; return; }
2123 if (value == 1 && UseIncDec) { decl(dst) ; return; }
2124 /* else */ { subl(dst, value) ; return; }
2125 }
2126
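// Signed division of reg by 2^shift_value using an arithmetic shift.
// sarl alone rounds towards negative infinity, while Java division truncates
// towards zero, so for a negative dividend we first add (2^shift_value - 1).
// For example, -7 / 4: sarl(-7, 2) == -2, but (-7 + 3) >> 2 == -1, which is
// the truncated quotient.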
2127 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2128 assert (shift_value > 0, "illegal shift value");
2129 Label _is_positive;
2130 testl (reg, reg);
2131 jcc (Assembler::positive, _is_positive);
2132 int offset = (1 << shift_value) - 1 ;
2133
2134 if (offset == 1) {
2135 incrementl(reg);
2136 } else {
2137 addl(reg, offset);
2138 }
2139
2140 bind (_is_positive);
2141 sarl(reg, shift_value);
2142 }
2143
2144 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2145 if (reachable(src)) {
2146 Assembler::divsd(dst, as_Address(src));
2147 } else {
2148 lea(rscratch1, src);
2149 Assembler::divsd(dst, Address(rscratch1, 0));
2150 }
2151 }
2152
2153 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2154 if (reachable(src)) {
2155 Assembler::divss(dst, as_Address(src));
2156 } else {
2157 lea(rscratch1, src);
2158 Assembler::divss(dst, Address(rscratch1, 0));
2159 }
2160 }
2161
2162 void MacroAssembler::enter() {
2163 push(rbp);
2164 mov(rbp, rsp);
2165 }
2166
2167 // A 5 byte nop that is safe for patching (see patch_verified_entry)
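// Both paths emit exactly 5 bytes: addr_nop_5() emits a single 5-byte NOP,
// and the fallback emits four segment-override prefixes (es/cs/fs/gs)
// followed by a one-byte NOP.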
2168 void MacroAssembler::fat_nop() {
2169 if (UseAddressNop) {
2170 addr_nop_5();
2171 } else {
2172 emit_int8(0x26); // es:
2173 emit_int8(0x2e); // cs:
2174 emit_int8(0x64); // fs:
2175 emit_int8(0x65); // gs:
2176 emit_int8((unsigned char)0x90);
2177 }
2178 }
2179
2180 #ifndef _LP64
2181 void MacroAssembler::fcmp(Register tmp) {
2182 fcmp(tmp, 1, true, true);
2183 }
2184
2185 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2186 assert(!pop_right || pop_left, "usage error");
2187 if (VM_Version::supports_cmov()) {
2188 assert(tmp == noreg, "unneeded temp");
2189 if (pop_left) {
2190 fucomip(index);
2191 } else {
2192 fucomi(index);
2193 }
2194 if (pop_right) {
2195 fpop();
2196 }
2197 } else {
2198 assert(tmp != noreg, "need temp");
2199 if (pop_left) {
2200 if (pop_right) {
2201 fcompp();
2202 } else {
2203 fcomp(index);
2204 }
2205 } else {
2206 fcom(index);
2207 }
2208 // convert FPU condition into eflags condition via rax,
2209 save_rax(tmp);
2210 fwait(); fnstsw_ax();
2211 sahf();
2212 restore_rax(tmp);
2213 }
2214 // condition codes set as follows:
2215 //
2216 // CF (corresponds to C0) if x < y
2217 // PF (corresponds to C2) if unordered
2218 // ZF (corresponds to C3) if x = y
2219 }
2220
2221 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2222 fcmp2int(dst, unordered_is_less, 1, true, true);
2223 }
2224
2225 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2226 fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2227 Label L;
2228 if (unordered_is_less) {
2229 movl(dst, -1);
2230 jcc(Assembler::parity, L);
2231 jcc(Assembler::below , L);
2232 movl(dst, 0);
2233 jcc(Assembler::equal , L);
2234 increment(dst);
2235 } else { // unordered is greater
2236 movl(dst, 1);
2237 jcc(Assembler::parity, L);
2238 jcc(Assembler::above , L);
2239 movl(dst, 0);
2240 jcc(Assembler::equal , L);
2241 decrementl(dst);
2242 }
2243 bind(L);
2244 }
2245
2246 void MacroAssembler::fld_d(AddressLiteral src) {
2247 fld_d(as_Address(src));
2248 }
2249
2250 void MacroAssembler::fld_s(AddressLiteral src) {
2251 fld_s(as_Address(src));
2252 }
2253
2254 void MacroAssembler::fldcw(AddressLiteral src) {
2255 Assembler::fldcw(as_Address(src));
2256 }
2257
2258 void MacroAssembler::fpop() {
2259 ffree();
2260 fincstp();
2261 }
2262
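// Computes st0 = fmod(st0, st1) by iterating fprem: fprem leaves C2 set in
// the FPU status word while the reduction is only partial, and the
// fnstsw_ax/sahf pair copies C2 into the parity flag, so the loop repeats
// while parity is set.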
2263 void MacroAssembler::fremr(Register tmp) {
2264 save_rax(tmp);
2265 { Label L;
2266 bind(L);
2267 fprem();
2268 fwait(); fnstsw_ax();
2269 sahf();
2270 jcc(Assembler::parity, L);
2271 }
2272 restore_rax(tmp);
2273 // Result is in ST0.
2274 // Note: fxch & fpop to get rid of ST1
2275 // (otherwise FPU stack could overflow eventually)
2276 fxch(1);
2277 fpop();
2278 }
2279
2280 void MacroAssembler::empty_FPU_stack() {
2281 if (VM_Version::supports_mmx()) {
2282 emms();
2283 } else {
2284 for (int i = 8; i-- > 0; ) ffree(i);
2285 }
2286 }
2287 #endif // !LP64
2288
2289 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2290 if (reachable(src)) {
2291 Assembler::mulpd(dst, as_Address(src));
2292 } else {
2293 lea(rscratch1, src);
2294 Assembler::mulpd(dst, Address(rscratch1, 0));
2295 }
2296 }
2297
2298 void MacroAssembler::load_float(Address src) {
2299 #ifdef _LP64
2300 movflt(xmm0, src);
2301 #else
2302 if (UseSSE >= 1) {
2303 movflt(xmm0, src);
2304 } else {
2305 fld_s(src);
2306 }
2307 #endif // LP64
2308 }
2309
2310 void MacroAssembler::store_float(Address dst) {
2311 #ifdef _LP64
2312 movflt(dst, xmm0);
2313 #else
2314 if (UseSSE >= 1) {
2315 movflt(dst, xmm0);
2316 } else {
2317 fstp_s(dst);
2318 }
2319 #endif // LP64
2320 }
2321
2322 void MacroAssembler::load_double(Address src) {
2323 #ifdef _LP64
2324 movdbl(xmm0, src);
2325 #else
2326 if (UseSSE >= 2) {
2327 movdbl(xmm0, src);
2328 } else {
2329 fld_d(src);
2330 }
2331 #endif // LP64
2332 }
2333
2334 void MacroAssembler::store_double(Address dst) {
2335 #ifdef _LP64
2336 movdbl(dst, xmm0);
2337 #else
2338 if (UseSSE >= 2) {
2339 movdbl(dst, xmm0);
2340 } else {
2341 fstp_d(dst);
2342 }
2343 #endif // LP64
2344 }
2345
2346 // dst = c = a * b + c
2347 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2348 Assembler::vfmadd231sd(c, a, b);
2349 if (dst != c) {
2350 movdbl(dst, c);
2351 }
2352 }
2353
2354 // dst = c = a * b + c
2355 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2356 Assembler::vfmadd231ss(c, a, b);
2357 if (dst != c) {
2358 movflt(dst, c);
2359 }
2360 }
2361
2362 // dst = c = a * b + c
2363 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2364 Assembler::vfmadd231pd(c, a, b, vector_len);
2365 if (dst != c) {
2366 vmovdqu(dst, c);
2367 }
2368 }
2369
2370 // dst = c = a * b + c
2371 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2372 Assembler::vfmadd231ps(c, a, b, vector_len);
2373 if (dst != c) {
2374 vmovdqu(dst, c);
2375 }
2376 }
2377
2378 // dst = c = a * b + c
2379 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2380 Assembler::vfmadd231pd(c, a, b, vector_len);
2381 if (dst != c) {
2382 vmovdqu(dst, c);
2383 }
2384 }
2385
2386 // dst = c = a * b + c
2387 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2388 Assembler::vfmadd231ps(c, a, b, vector_len);
2389 if (dst != c) {
2390 vmovdqu(dst, c);
2391 }
2392 }
2393
2394 void MacroAssembler::incrementl(AddressLiteral dst) {
2395 if (reachable(dst)) {
2396 incrementl(as_Address(dst));
2397 } else {
2398 lea(rscratch1, dst);
2399 incrementl(Address(rscratch1, 0));
2400 }
2401 }
2402
2403 void MacroAssembler::incrementl(ArrayAddress dst) {
2404 incrementl(as_Address(dst));
2405 }
2406
2407 void MacroAssembler::incrementl(Register reg, int value) {
2408 if (value == min_jint) {addl(reg, value) ; return; }
2409 if (value < 0) { decrementl(reg, -value); return; }
2410 if (value == 0) { ; return; }
2411 if (value == 1 && UseIncDec) { incl(reg) ; return; }
2412 /* else */ { addl(reg, value) ; return; }
2413 }
2414
2415 void MacroAssembler::incrementl(Address dst, int value) {
2416 if (value == min_jint) {addl(dst, value) ; return; }
2417 if (value < 0) { decrementl(dst, -value); return; }
2418 if (value == 0) { ; return; }
2419 if (value == 1 && UseIncDec) { incl(dst) ; return; }
2420 /* else */ { addl(dst, value) ; return; }
2421 }
2422
2423 void MacroAssembler::jump(AddressLiteral dst) {
2424 if (reachable(dst)) {
2425 jmp_literal(dst.target(), dst.rspec());
2426 } else {
2427 lea(rscratch1, dst);
2428 jmp(rscratch1);
2429 }
2430 }
2431
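// Conditional jump to an AddressLiteral. If the target is reachable, a jcc
// with an 8-bit or 32-bit displacement is emitted directly; otherwise the
// condition is reversed and a short branch skips over an indirect jmp through
// rscratch1, which can always reach the target.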
2432 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
2433 if (reachable(dst)) {
2434 InstructionMark im(this);
2435 relocate(dst.reloc());
2436 const int short_size = 2;
2437 const int long_size = 6;
2438 int offs = (intptr_t)dst.target() - ((intptr_t)pc());
2439 if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
2440 // 0111 tttn #8-bit disp
2441 emit_int8(0x70 | cc);
2442 emit_int8((offs - short_size) & 0xFF);
2443 } else {
2444 // 0000 1111 1000 tttn #32-bit disp
2445 emit_int8(0x0F);
2446 emit_int8((unsigned char)(0x80 | cc));
2447 emit_int32(offs - long_size);
2448 }
2449 } else {
2450 #ifdef ASSERT
2451 warning("reversing conditional branch");
2452 #endif /* ASSERT */
2453 Label skip;
2454 jccb(reverse[cc], skip);
2455 lea(rscratch1, dst);
2456 Assembler::jmp(rscratch1);
2457 bind(skip);
2458 }
2459 }
2460
2461 void MacroAssembler::fld_x(AddressLiteral src) {
2462 Assembler::fld_x(as_Address(src));
2463 }
2464
2465 void MacroAssembler::ldmxcsr(AddressLiteral src) {
2466 if (reachable(src)) {
2467 Assembler::ldmxcsr(as_Address(src));
2468 } else {
2469 lea(rscratch1, src);
2470 Assembler::ldmxcsr(Address(rscratch1, 0));
2471 }
2472 }
2473
2474 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2475 int off;
2476 if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2477 off = offset();
2478 movsbl(dst, src); // movsxb
2479 } else {
2480 off = load_unsigned_byte(dst, src);
2481 shll(dst, 24);
2482 sarl(dst, 24);
2483 }
2484 return off;
2485 }
2486
2487 // Note: load_signed_short used to be called load_signed_word.
2488 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
2489 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
2490 // The term "word" in HotSpot means a 32- or 64-bit machine word.
2491 int MacroAssembler::load_signed_short(Register dst, Address src) {
2492 int off;
2493 if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2494 // This is dubious since it seems safe to do a signed 16 => 64 bit sign
2495 // extension, but this is what 64bit has always done, which implies
2496 // that callers rely only on the low 32 bits.
2497 off = offset();
2498 movswl(dst, src); // movsxw
2499 } else {
2500 off = load_unsigned_short(dst, src);
2501 shll(dst, 16);
2502 sarl(dst, 16);
2503 }
2504 return off;
2505 }
2506
2507 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2508 // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
2509 // and "3.9 Partial Register Penalties" (p. 22).
2510 int off;
2511 if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
2512 off = offset();
2513 movzbl(dst, src); // movzxb
2514 } else {
2515 xorl(dst, dst);
2516 off = offset();
2517 movb(dst, src);
2518 }
2519 return off;
2520 }
2521
2522 // Note: load_unsigned_short used to be called load_unsigned_word.
2523 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2524 // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
2525 // and "3.9 Partial Register Penalties" (p. 22).
2526 int off;
2527 if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
2528 off = offset();
2529 movzwl(dst, src); // movzxw
2530 } else {
2531 xorl(dst, dst);
2532 off = offset();
2533 movw(dst, src);
2534 }
2535 return off;
2536 }
2537
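// Load a value of size_in_bytes from src into dst, sign- or zero-extending
// sub-word values as requested. On 32 bit a 64-bit value needs a second
// destination register (dst2) for the high word.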
2538 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
2539 switch (size_in_bytes) {
2540 #ifndef _LP64
2541 case 8:
2542 assert(dst2 != noreg, "second dest register required");
2543 movl(dst, src);
2544 movl(dst2, src.plus_disp(BytesPerInt));
2545 break;
2546 #else
2547 case 8: movq(dst, src); break;
2548 #endif
2549 case 4: movl(dst, src); break;
2550 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2551 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2552 default: ShouldNotReachHere();
2553 }
2554 }
2555
2556 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2557 switch (size_in_bytes) {
2558 #ifndef _LP64
2559 case 8:
2560 assert(src2 != noreg, "second source register required");
2561 movl(dst, src);
2562 movl(dst.plus_disp(BytesPerInt), src2);
2563 break;
2564 #else
2565 case 8: movq(dst, src); break;
2566 #endif
2567 case 4: movl(dst, src); break;
2568 case 2: movw(dst, src); break;
2569 case 1: movb(dst, src); break;
2570 default: ShouldNotReachHere();
2571 }
2572 }
2573
2574 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
2575 if (reachable(dst)) {
2576 movl(as_Address(dst), src);
2577 } else {
2578 lea(rscratch1, dst);
2579 movl(Address(rscratch1, 0), src);
2580 }
2581 }
2582
2583 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2584 if (reachable(src)) {
2585 movl(dst, as_Address(src));
2586 } else {
2587 lea(rscratch1, src);
2588 movl(dst, Address(rscratch1, 0));
2589 }
2590 }
2591
2592 // C++ bool manipulation
2593
2594 void MacroAssembler::movbool(Register dst, Address src) {
2595 if(sizeof(bool) == 1)
2596 movb(dst, src);
2597 else if(sizeof(bool) == 2)
2598 movw(dst, src);
2599 else if(sizeof(bool) == 4)
2600 movl(dst, src);
2601 else
2602 // unsupported
2603 ShouldNotReachHere();
2604 }
2605
2606 void MacroAssembler::movbool(Address dst, bool boolconst) {
2607 if(sizeof(bool) == 1)
2608 movb(dst, (int) boolconst);
2609 else if(sizeof(bool) == 2)
2610 movw(dst, (int) boolconst);
2611 else if(sizeof(bool) == 4)
2612 movl(dst, (int) boolconst);
2613 else
2614 // unsupported
2615 ShouldNotReachHere();
2616 }
2617
2618 void MacroAssembler::movbool(Address dst, Register src) {
2619 if(sizeof(bool) == 1)
2620 movb(dst, src);
2621 else if(sizeof(bool) == 2)
2622 movw(dst, src);
2623 else if(sizeof(bool) == 4)
2624 movl(dst, src);
2625 else
2626 // unsupported
2627 ShouldNotReachHere();
2628 }
2629
2630 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
2631 movb(as_Address(dst), src);
2632 }
2633
2634 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
2635 if (reachable(src)) {
2636 movdl(dst, as_Address(src));
2637 } else {
2638 lea(rscratch1, src);
2639 movdl(dst, Address(rscratch1, 0));
2640 }
2641 }
2642
2643 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
2644 if (reachable(src)) {
2645 movq(dst, as_Address(src));
2646 } else {
2647 lea(rscratch1, src);
2648 movq(dst, Address(rscratch1, 0));
2649 }
2650 }
2651
2652 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
2653 if (reachable(src)) {
2654 if (UseXmmLoadAndClearUpper) {
2655 movsd (dst, as_Address(src));
2656 } else {
2657 movlpd(dst, as_Address(src));
2658 }
2659 } else {
2660 lea(rscratch1, src);
2661 if (UseXmmLoadAndClearUpper) {
2662 movsd (dst, Address(rscratch1, 0));
2663 } else {
2664 movlpd(dst, Address(rscratch1, 0));
2665 }
2666 }
2667 }
2668
2669 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
2670 if (reachable(src)) {
2671 movss(dst, as_Address(src));
2672 } else {
2673 lea(rscratch1, src);
2674 movss(dst, Address(rscratch1, 0));
2675 }
2676 }
2677
2678 void MacroAssembler::movptr(Register dst, Register src) {
2679 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2680 }
2681
2682 void MacroAssembler::movptr(Register dst, Address src) {
2683 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2684 }
2685
2686 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
2687 void MacroAssembler::movptr(Register dst, intptr_t src) {
2688 LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
2689 }
2690
2691 void MacroAssembler::movptr(Address dst, Register src) {
2692 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2693 }
2694
2695 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2696 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2697 Assembler::movdqu(dst, src);
2698 }
2699
2700 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2701 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2702 Assembler::movdqu(dst, src);
2703 }
2704
2705 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2706 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2707 Assembler::movdqu(dst, src);
2708 }
2709
2710 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
2711 if (reachable(src)) {
2712 movdqu(dst, as_Address(src));
2713 } else {
2714 lea(scratchReg, src);
2715 movdqu(dst, Address(scratchReg, 0));
2716 }
2717 }
2718
2719 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2720 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2721 Assembler::vmovdqu(dst, src);
2722 }
2723
2724 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2725 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2726 Assembler::vmovdqu(dst, src);
2727 }
2728
2729 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2730 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2731 Assembler::vmovdqu(dst, src);
2732 }
2733
2734 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
2735 if (reachable(src)) {
2736 vmovdqu(dst, as_Address(src));
2737 }
2738 else {
2739 lea(scratch_reg, src);
2740 vmovdqu(dst, Address(scratch_reg, 0));
2741 }
2742 }
2743
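// Width-dispatching moves for opmask registers: with AVX512BW the full 64-bit
// mask registers are supported and kmovq is used; otherwise only EVEX/AVX512F
// is assumed and the 16-bit kmovw form is used.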
2744 void MacroAssembler::kmov(KRegister dst, Address src) {
2745 if (VM_Version::supports_avx512bw()) {
2746 kmovql(dst, src);
2747 } else {
2748 assert(VM_Version::supports_evex(), "");
2749 kmovwl(dst, src);
2750 }
2751 }
2752
2753 void MacroAssembler::kmov(Address dst, KRegister src) {
2754 if (VM_Version::supports_avx512bw()) {
2755 kmovql(dst, src);
2756 } else {
2757 assert(VM_Version::supports_evex(), "");
2758 kmovwl(dst, src);
2759 }
2760 }
2761
2762 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2763 if (VM_Version::supports_avx512bw()) {
2764 kmovql(dst, src);
2765 } else {
2766 assert(VM_Version::supports_evex(), "");
2767 kmovwl(dst, src);
2768 }
2769 }
2770
2771 void MacroAssembler::kmov(Register dst, KRegister src) {
2772 if (VM_Version::supports_avx512bw()) {
2773 kmovql(dst, src);
2774 } else {
2775 assert(VM_Version::supports_evex(), "");
2776 kmovwl(dst, src);
2777 }
2778 }
2779
2780 void MacroAssembler::kmov(KRegister dst, Register src) {
2781 if (VM_Version::supports_avx512bw()) {
2782 kmovql(dst, src);
2783 } else {
2784 assert(VM_Version::supports_evex(), "");
2785 kmovwl(dst, src);
2786 }
2787 }
2788
2789 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register scratch_reg) {
2790 if (reachable(src)) {
2791 kmovql(dst, as_Address(src));
2792 } else {
2793 lea(scratch_reg, src);
2794 kmovql(dst, Address(scratch_reg, 0));
2795 }
2796 }
2797
2798 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) {
2799 if (reachable(src)) {
2800 kmovwl(dst, as_Address(src));
2801 } else {
2802 lea(scratch_reg, src);
2803 kmovwl(dst, Address(scratch_reg, 0));
2804 }
2805 }
2806
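// Masked load from an AddressLiteral. k0 cannot be encoded as a write mask in
// EVEX (it means "no masking"), so when mask == k0 the unmasked form is
// emitted instead.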
2807 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2808 int vector_len, Register scratch_reg) {
2809 if (reachable(src)) {
2810 if (mask == k0) {
2811 Assembler::evmovdqub(dst, as_Address(src), merge, vector_len);
2812 } else {
2813 Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2814 }
2815 } else {
2816 lea(scratch_reg, src);
2817 if (mask == k0) {
2818 Assembler::evmovdqub(dst, Address(scratch_reg, 0), merge, vector_len);
2819 } else {
2820 Assembler::evmovdqub(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2821 }
2822 }
2823 }
2824
2825 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2826 int vector_len, Register scratch_reg) {
2827 if (reachable(src)) {
2828 Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2829 } else {
2830 lea(scratch_reg, src);
2831 Assembler::evmovdquw(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2832 }
2833 }
2834
2835 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2836 int vector_len, Register scratch_reg) {
2837 if (reachable(src)) {
2838 Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2839 } else {
2840 lea(scratch_reg, src);
2841 Assembler::evmovdqul(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2842 }
2843 }
2844
2845 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2846 int vector_len, Register scratch_reg) {
2847 if (reachable(src)) {
2848 Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2849 } else {
2850 lea(scratch_reg, src);
2851 Assembler::evmovdquq(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2852 }
2853 }
2854
2855 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2856 if (reachable(src)) {
2857 Assembler::evmovdquq(dst, as_Address(src), vector_len);
2858 } else {
2859 lea(rscratch, src);
2860 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2861 }
2862 }
2863
2864 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
2865 if (reachable(src)) {
2866 Assembler::movdqa(dst, as_Address(src));
2867 } else {
2868 lea(rscratch1, src);
2869 Assembler::movdqa(dst, Address(rscratch1, 0));
2870 }
2871 }
2872
2873 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
2874 if (reachable(src)) {
2875 Assembler::movsd(dst, as_Address(src));
2876 } else {
2877 lea(rscratch1, src);
2878 Assembler::movsd(dst, Address(rscratch1, 0));
2879 }
2880 }
2881
2882 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
2883 if (reachable(src)) {
2884 Assembler::movss(dst, as_Address(src));
2885 } else {
2886 lea(rscratch1, src);
2887 Assembler::movss(dst, Address(rscratch1, 0));
2888 }
2889 }
2890
2891 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
2892 if (reachable(src)) {
2893 Assembler::mulsd(dst, as_Address(src));
2894 } else {
2895 lea(rscratch1, src);
2896 Assembler::mulsd(dst, Address(rscratch1, 0));
2897 }
2898 }
2899
2900 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
2901 if (reachable(src)) {
2902 Assembler::mulss(dst, as_Address(src));
2903 } else {
2904 lea(rscratch1, src);
2905 Assembler::mulss(dst, Address(rscratch1, 0));
2906 }
2907 }
2908
2909 void MacroAssembler::null_check(Register reg, int offset) {
2910 if (needs_explicit_null_check(offset)) {
2911 // provoke OS NULL exception if reg = NULL by
2912 // accessing M[reg] w/o changing any (non-CC) registers
2913 // NOTE: cmpl is plenty here to provoke a segv
2914 cmpptr(rax, Address(reg, 0));
2915 // Note: should probably use testl(rax, Address(reg, 0));
2916 // may be shorter code (however, this version of
2917 // testl needs to be implemented first)
2918 } else {
2919 // nothing to do, (later) access of M[reg + offset]
2920 // will provoke OS NULL exception if reg = NULL
2921 }
2922 }
2923
2924 void MacroAssembler::os_breakpoint() {
2925 // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
2926 // (e.g., MSVC can't call ps() otherwise)
2927 call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2928 }
2929
2930 void MacroAssembler::unimplemented(const char* what) {
2931 const char* buf = NULL;
2932 {
2933 ResourceMark rm;
2934 stringStream ss;
2935 ss.print("unimplemented: %s", what);
2936 buf = code_string(ss.as_string());
2937 }
2938 stop(buf);
2939 }
2940
2941 #ifdef _LP64
2942 #define XSTATE_BV 0x200
2943 #endif
2944
2945 void MacroAssembler::pop_CPU_state() {
2946 pop_FPU_state();
2947 pop_IU_state();
2948 }
2949
2950 void MacroAssembler::pop_FPU_state() {
2951 #ifndef _LP64
2952 frstor(Address(rsp, 0));
2953 #else
2954 fxrstor(Address(rsp, 0));
2955 #endif
2956 addptr(rsp, FPUStateSizeInWords * wordSize);
2957 }
2958
2959 void MacroAssembler::pop_IU_state() {
2960 popa();
2961 LP64_ONLY(addq(rsp, 8));
2962 popf();
2963 }
2964
2965 // Save Integer and Float state
2966 // Warning: Stack must be 16 byte aligned (64bit)
2967 void MacroAssembler::push_CPU_state() {
2968 push_IU_state();
2969 push_FPU_state();
2970 }
2971
2972 void MacroAssembler::push_FPU_state() {
2973 subptr(rsp, FPUStateSizeInWords * wordSize);
2974 #ifndef _LP64
2975 fnsave(Address(rsp, 0));
2976 fwait();
2977 #else
2978 fxsave(Address(rsp, 0));
2979 #endif // LP64
2980 }
2981
2982 void MacroAssembler::push_IU_state() {
2983 // Push flags first because pusha kills them
2984 pushf();
2985 // Make sure rsp stays 16-byte aligned
2986 LP64_ONLY(subq(rsp, 8));
2987 pusha();
2988 }
2989
2990 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) {
  // determine java_thread register
2991 if (!java_thread->is_valid()) {
2992 java_thread = rdi;
2993 get_thread(java_thread);
2994 }
2995 // we must set sp to zero to clear frame
2996 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
2997 // must clear fp, so that compiled frames are not confused; it is
2998 // possible that we need it only for debugging
2999 if (clear_fp) {
3000 movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3001 }
3002 // Always clear the pc because it could have been set by make_walkable()
3003 movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3004 vzeroupper();
3005 }
3006
3007 void MacroAssembler::restore_rax(Register tmp) {
3008 if (tmp == noreg) pop(rax);
3009 else if (tmp != rax) mov(rax, tmp);
3010 }
3011
3012 void MacroAssembler::round_to(Register reg, int modulus) {
3013 addptr(reg, modulus - 1);
3014 andptr(reg, -modulus);
3015 }
3016
3017 void MacroAssembler::save_rax(Register tmp) {
3018 if (tmp == noreg) push(rax);
3019 else if (tmp != rax) mov(tmp, rax);
3020 }
3021
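// Poll for a safepoint or handshake. At a return the frame's sp (or rbp when
// not in an nmethod) is compared against the per-thread polling word, which
// doubles as the stack watermark; if it is above the watermark the slow path
// is taken. Elsewhere it is enough to test the poll bit in the polling word.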
3022 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
3023 if (at_return) {
3024 // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
3025 // we may safely use rsp instead to perform the stack watermark check.
3026 cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset()));
3027 jcc(Assembler::above, slow_path);
3028 return;
3029 }
3030 testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
3031 jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
3032 }
3033
3034 // Calls to C land
3035 //
3036 // When entering C land, the rbp and rsp of the last Java frame have to be recorded
3037 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3038 // has to be reset to 0. This is required to allow proper stack traversal.
3039 void MacroAssembler::set_last_Java_frame(Register java_thread,
3040 Register last_java_sp,
3041 Register last_java_fp,
3042 address last_java_pc) {
3043 vzeroupper();
3044 // determine java_thread register
3045 if (!java_thread->is_valid()) {
3046 java_thread = rdi;
3047 get_thread(java_thread);
3048 }
3049 // determine last_java_sp register
3050 if (!last_java_sp->is_valid()) {
3051 last_java_sp = rsp;
3052 }
3053
3054 // last_java_fp is optional
3055
3056 if (last_java_fp->is_valid()) {
3057 movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3058 }
3059
3060 // last_java_pc is optional
3061
3062 if (last_java_pc != NULL) {
3063 lea(Address(java_thread,
3064 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3065 InternalAddress(last_java_pc));
3066
3067 }
3068 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3069 }
3070
3071 void MacroAssembler::shlptr(Register dst, int imm8) {
3072 LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3073 }
3074
3075 void MacroAssembler::shrptr(Register dst, int imm8) {
3076 LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3077 }
3078
3079 void MacroAssembler::sign_extend_byte(Register reg) {
3080 if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3081 movsbl(reg, reg); // movsxb
3082 } else {
3083 shll(reg, 24);
3084 sarl(reg, 24);
3085 }
3086 }
3087
3088 void MacroAssembler::sign_extend_short(Register reg) {
3089 if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3090 movswl(reg, reg); // movsxw
3091 } else {
3092 shll(reg, 16);
3093 sarl(reg, 16);
3094 }
3095 }
3096
3097 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3098 assert(reachable(src), "Address should be reachable");
3099 testl(dst, as_Address(src));
3100 }
3101
3102 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3103 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3104 Assembler::pcmpeqb(dst, src);
3105 }
3106
3107 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3108 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3109 Assembler::pcmpeqw(dst, src);
3110 }
3111
3112 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3113 assert((dst->encoding() < 16),"XMM register should be 0-15");
3114 Assembler::pcmpestri(dst, src, imm8);
3115 }
3116
3117 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3118 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3119 Assembler::pcmpestri(dst, src, imm8);
3120 }
3121
3122 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3123 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3124 Assembler::pmovzxbw(dst, src);
3125 }
3126
3127 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3128 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3129 Assembler::pmovzxbw(dst, src);
3130 }
3131
3132 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3133 assert((src->encoding() < 16),"XMM register should be 0-15");
3134 Assembler::pmovmskb(dst, src);
3135 }
3136
3137 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3138 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3139 Assembler::ptest(dst, src);
3140 }
3141
3142 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3143 if (reachable(src)) {
3144 Assembler::sqrtsd(dst, as_Address(src));
3145 } else {
3146 lea(rscratch1, src);
3147 Assembler::sqrtsd(dst, Address(rscratch1, 0));
3148 }
3149 }
3150
3151 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3152 if (reachable(src)) {
3153 Assembler::sqrtss(dst, as_Address(src));
3154 } else {
3155 lea(rscratch1, src);
3156 Assembler::sqrtss(dst, Address(rscratch1, 0));
3157 }
3158 }
3159
3160 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3161 if (reachable(src)) {
3162 Assembler::subsd(dst, as_Address(src));
3163 } else {
3164 lea(rscratch1, src);
3165 Assembler::subsd(dst, Address(rscratch1, 0));
3166 }
3167 }
3168
3169 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) {
3170 if (reachable(src)) {
3171 Assembler::roundsd(dst, as_Address(src), rmode);
3172 } else {
3173 lea(scratch_reg, src);
3174 Assembler::roundsd(dst, Address(scratch_reg, 0), rmode);
3175 }
3176 }
3177
3178 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3179 if (reachable(src)) {
3180 Assembler::subss(dst, as_Address(src));
3181 } else {
3182 lea(rscratch1, src);
3183 Assembler::subss(dst, Address(rscratch1, 0));
3184 }
3185 }
3186
3187 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3188 if (reachable(src)) {
3189 Assembler::ucomisd(dst, as_Address(src));
3190 } else {
3191 lea(rscratch1, src);
3192 Assembler::ucomisd(dst, Address(rscratch1, 0));
3193 }
3194 }
3195
3196 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3197 if (reachable(src)) {
3198 Assembler::ucomiss(dst, as_Address(src));
3199 } else {
3200 lea(rscratch1, src);
3201 Assembler::ucomiss(dst, Address(rscratch1, 0));
3202 }
3203 }
3204
3205 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3206 // Used in sign-bit flipping with aligned address.
3207 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3208 if (reachable(src)) {
3209 Assembler::xorpd(dst, as_Address(src));
3210 } else {
3211 lea(scratch_reg, src);
3212 Assembler::xorpd(dst, Address(scratch_reg, 0));
3213 }
3214 }
3215
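// xorpd/xorps of a register with itself is the common zeroing idiom. Without
// AVX512DQ the EVEX forms of xorpd/xorps are not available (so XMM registers
// 16-31 cannot be encoded); for the dst == src case the 512-bit vpxor, which
// only requires AVX512F, produces the same all-zero result and is used instead.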
3216 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3217 if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3218 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3219 }
3220 else {
3221 Assembler::xorpd(dst, src);
3222 }
3223 }
3224
3225 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3226 if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3227 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3228 } else {
3229 Assembler::xorps(dst, src);
3230 }
3231 }
3232
3233 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3234 // Used in sign-bit flipping with aligned address.
3235 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3236 if (reachable(src)) {
3237 Assembler::xorps(dst, as_Address(src));
3238 } else {
3239 lea(scratch_reg, src);
3240 Assembler::xorps(dst, Address(scratch_reg, 0));
3241 }
3242 }
3243
3244 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3245 // Used in sign-bit flipping with aligned address.
3246 bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3247 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3248 if (reachable(src)) {
3249 Assembler::pshufb(dst, as_Address(src));
3250 } else {
3251 lea(rscratch1, src);
3252 Assembler::pshufb(dst, Address(rscratch1, 0));
3253 }
3254 }
3255
3256 // AVX 3-operands instructions
3257
3258 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3259 if (reachable(src)) {
3260 vaddsd(dst, nds, as_Address(src));
3261 } else {
3262 lea(rscratch1, src);
3263 vaddsd(dst, nds, Address(rscratch1, 0));
3264 }
3265 }
3266
3267 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3268 if (reachable(src)) {
3269 vaddss(dst, nds, as_Address(src));
3270 } else {
3271 lea(rscratch1, src);
3272 vaddss(dst, nds, Address(rscratch1, 0));
3273 }
3274 }
3275
3276 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3277 assert(UseAVX > 0, "requires some form of AVX");
3278 if (reachable(src)) {
3279 Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
3280 } else {
3281 lea(rscratch, src);
3282 Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
3283 }
3284 }
3285
3286 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3287 assert(UseAVX > 0, "requires some form of AVX");
3288 if (reachable(src)) {
3289 Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3290 } else {
3291 lea(rscratch, src);
3292 Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3293 }
3294 }
3295
3296 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3297 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3298 vandps(dst, nds, negate_field, vector_len);
3299 }
3300
3301 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3302 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3303 vandpd(dst, nds, negate_field, vector_len);
3304 }
3305
3306 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3307 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3308 Assembler::vpaddb(dst, nds, src, vector_len);
3309 }
3310
3311 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3312 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3313 Assembler::vpaddb(dst, nds, src, vector_len);
3314 }
3315
3316 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3317 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3318 Assembler::vpaddw(dst, nds, src, vector_len);
3319 }
3320
3321 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3322 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3323 Assembler::vpaddw(dst, nds, src, vector_len);
3324 }
3325
3326 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3327 if (reachable(src)) {
3328 Assembler::vpand(dst, nds, as_Address(src), vector_len);
3329 } else {
3330 lea(scratch_reg, src);
3331 Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3332 }
3333 }
3334
3335 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3336 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3337 Assembler::vpbroadcastw(dst, src, vector_len);
3338 }
3339
3340 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3341 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3342 Assembler::vpcmpeqb(dst, nds, src, vector_len);
3343 }
3344
3345 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3346 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3347 Assembler::vpcmpeqw(dst, nds, src, vector_len);
3348 }
3349
3350 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds,
3351 AddressLiteral src, int vector_len, Register scratch_reg) {
3352 if (reachable(src)) {
3353 Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3354 } else {
3355 lea(scratch_reg, src);
3356 Assembler::evpcmpeqd(kdst, mask, nds, Address(scratch_reg, 0), vector_len);
3357 }
3358 }
3359
3360 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3361 int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3362 if (reachable(src)) {
3363 Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3364 } else {
3365 lea(scratch_reg, src);
3366 Assembler::evpcmpd(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3367 }
3368 }
3369
3370 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3371 int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3372 if (reachable(src)) {
3373 Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3374 } else {
3375 lea(scratch_reg, src);
3376 Assembler::evpcmpq(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3377 }
3378 }
3379
3380 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3381 int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3382 if (reachable(src)) {
3383 Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3384 } else {
3385 lea(scratch_reg, src);
3386 Assembler::evpcmpb(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3387 }
3388 }
3389
3390 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3391 int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3392 if (reachable(src)) {
3393 Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3394 } else {
3395 lea(scratch_reg, src);
3396 Assembler::evpcmpw(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3397 }
3398 }
3399
3400 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3401 if (width == Assembler::Q) {
3402 Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3403 } else {
3404 Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3405 }
3406 }
3407
3408 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg) {
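// Opcode selection: for byte/word/dword elements the pcmpeq*/pcmpgt* opcode is the
// byte-form base (0x74/0x64) plus the element width; quadword compares use the
// 0F 38-map pcmpeqq/pcmpgtq encodings (0x29/0x37) instead.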
3409 int eq_cond_enc = 0x29;
3410 int gt_cond_enc = 0x37;
3411 if (width != Assembler::Q) {
3412 eq_cond_enc = 0x74 + width;
3413 gt_cond_enc = 0x64 + width;
3414 }
3415 switch (cond) {
3416 case eq:
3417 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3418 break;
3419 case neq:
3420 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3421 vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3422 break;
3423 case le:
3424 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3425 vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3426 break;
3427 case nlt:
3428 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3429 vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3430 break;
3431 case lt:
3432 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3433 break;
3434 case nle:
3435 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3436 break;
3437 default:
3438 assert(false, "Should not reach here");
3439 }
3440 }
3441
3442 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3443 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3444 Assembler::vpmovzxbw(dst, src, vector_len);
3445 }
3446
3447 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3448 assert((src->encoding() < 16),"XMM register should be 0-15");
3449 Assembler::vpmovmskb(dst, src, vector_len);
3450 }
3451
3452 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3453 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3454 Assembler::vpmullw(dst, nds, src, vector_len);
3455 }
3456
3457 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3458 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3459 Assembler::vpmullw(dst, nds, src, vector_len);
3460 }
3461
3462 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3463 assert((UseAVX > 0), "AVX support is needed");
3464 if (reachable(src)) {
3465 Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3466 } else {
3467 lea(scratch_reg, src);
3468 Assembler::vpmulld(dst, nds, Address(scratch_reg, 0), vector_len);
3469 }
3470 }
3471
3472 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3473 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3474 Assembler::vpsubb(dst, nds, src, vector_len);
3475 }
3476
3477 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3478 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3479 Assembler::vpsubb(dst, nds, src, vector_len);
3480 }
3481
3482 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3483 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3484 Assembler::vpsubw(dst, nds, src, vector_len);
3485 }
3486
3487 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3488 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3489 Assembler::vpsubw(dst, nds, src, vector_len);
3490 }
3491
3492 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3493 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3494 Assembler::vpsraw(dst, nds, shift, vector_len);
3495 }
3496
3497 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3498 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3499 Assembler::vpsraw(dst, nds, shift, vector_len);
3500 }
3501
3502 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3503 assert(UseAVX > 2,"");
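// vpsraq has only EVEX encodings; without AVX512VL the 128/256-bit forms are
// unavailable, so fall back to a 512-bit vector length.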
3504 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3505 vector_len = 2;
3506 }
3507 Assembler::evpsraq(dst, nds, shift, vector_len);
3508 }
3509
3510 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3511 assert(UseAVX > 2,"");
3512 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3513 vector_len = 2;
3514 }
3515 Assembler::evpsraq(dst, nds, shift, vector_len);
3516 }
3517
3518 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3519 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3520 Assembler::vpsrlw(dst, nds, shift, vector_len);
3521 }
3522
3523 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3524 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3525 Assembler::vpsrlw(dst, nds, shift, vector_len);
3526 }
3527
3528 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3529 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3530 Assembler::vpsllw(dst, nds, shift, vector_len);
3531 }
3532
3533 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3534 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3535 Assembler::vpsllw(dst, nds, shift, vector_len);
3536 }
3537
3538 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3539 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3540 Assembler::vptest(dst, src);
3541 }
3542
3543 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3544 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3545 Assembler::punpcklbw(dst, src);
3546 }
3547
3548 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3549 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3550 Assembler::pshufd(dst, src, mode);
3551 }
3552
3553 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3554 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3555 Assembler::pshuflw(dst, src, mode);
3556 }
3557
3558 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3559 if (reachable(src)) {
3560 vandpd(dst, nds, as_Address(src), vector_len);
3561 } else {
3562 lea(scratch_reg, src);
3563 vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3564 }
3565 }
3566
3567 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3568 if (reachable(src)) {
3569 vandps(dst, nds, as_Address(src), vector_len);
3570 } else {
3571 lea(scratch_reg, src);
3572 vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3573 }
3574 }
3575
3576 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3577 bool merge, int vector_len, Register scratch_reg) {
3578 if (reachable(src)) {
3579 Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3580 } else {
3581 lea(scratch_reg, src);
3582 Assembler::evpord(dst, mask, nds, Address(scratch_reg, 0), merge, vector_len);
3583 }
3584 }
3585
3586 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3587 if (reachable(src)) {
3588 vdivsd(dst, nds, as_Address(src));
3589 } else {
3590 lea(rscratch1, src);
3591 vdivsd(dst, nds, Address(rscratch1, 0));
3592 }
3593 }
3594
3595 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3596 if (reachable(src)) {
3597 vdivss(dst, nds, as_Address(src));
3598 } else {
3599 lea(rscratch1, src);
3600 vdivss(dst, nds, Address(rscratch1, 0));
3601 }
3602 }
3603
3604 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3605 if (reachable(src)) {
3606 vmulsd(dst, nds, as_Address(src));
3607 } else {
3608 lea(rscratch1, src);
3609 vmulsd(dst, nds, Address(rscratch1, 0));
3610 }
3611 }
3612
3613 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3614 if (reachable(src)) {
3615 vmulss(dst, nds, as_Address(src));
3616 } else {
3617 lea(rscratch1, src);
3618 vmulss(dst, nds, Address(rscratch1, 0));
3619 }
3620 }
3621
3622 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3623 if (reachable(src)) {
3624 vsubsd(dst, nds, as_Address(src));
3625 } else {
3626 lea(rscratch1, src);
3627 vsubsd(dst, nds, Address(rscratch1, 0));
3628 }
3629 }
3630
3631 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3632 if (reachable(src)) {
3633 vsubss(dst, nds, as_Address(src));
3634 } else {
3635 lea(rscratch1, src);
3636 vsubss(dst, nds, Address(rscratch1, 0));
3637 }
3638 }
3639
3640 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
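// Negate by XORing with the sign-bit mask that src points to.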
3641 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3642 vxorps(dst, nds, src, Assembler::AVX_128bit);
3643 }
3644
3645 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3646 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3647 vxorpd(dst, nds, src, Assembler::AVX_128bit);
3648 }
3649
3650 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3651 if (reachable(src)) {
3652 vxorpd(dst, nds, as_Address(src), vector_len);
3653 } else {
3654 lea(scratch_reg, src);
3655 vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
3656 }
3657 }
3658
3659 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3660 if (reachable(src)) {
3661 vxorps(dst, nds, as_Address(src), vector_len);
3662 } else {
3663 lea(scratch_reg, src);
3664 vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
3665 }
3666 }
3667
3668 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
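// Integer vpxor on 256-bit vectors requires AVX2; with plain AVX fall back to the
// bitwise-equivalent vxorpd.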
3669 if (UseAVX > 1 || (vector_len < 1)) {
3670 if (reachable(src)) {
3671 Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3672 } else {
3673 lea(scratch_reg, src);
3674 Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
3675 }
3676 }
3677 else {
3678 MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
3679 }
3680 }
3681
3682 void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3683 if (reachable(src)) {
3684 Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3685 } else {
3686 lea(scratch_reg, src);
3687 Assembler::vpermd(dst, nds, Address(scratch_reg, 0), vector_len);
3688 }
3689 }
3690
3691 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
3692 const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
3693 STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
3694 // The inverted mask is sign-extended
3695 andptr(possibly_jweak, inverted_jweak_mask);
3696 }
3697
3698 void MacroAssembler::resolve_jobject(Register value,
3699 Register thread,
3700 Register tmp) {
3701 assert_different_registers(value, thread, tmp);
3702 Label done, not_weak;
3703 testptr(value, value);
3704 jcc(Assembler::zero, done); // Use NULL as-is.
3705 testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
3706 jcc(Assembler::zero, not_weak);
3707 // Resolve jweak.
3708 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3709 value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
3710 verify_oop(value);
3711 jmp(done);
3712 bind(not_weak);
3713 // Resolve (untagged) jobject.
3714 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
3715 verify_oop(value);
3716 bind(done);
3717 }
3718
3719 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3720 LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
3721 }
3722
3723 // Force generation of a 4-byte immediate value even if it fits into 8 bits
3724 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3725 LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
3726 }
3727
3728 void MacroAssembler::subptr(Register dst, Register src) {
3729 LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
3730 }
3731
3732 // C++ bool manipulation
3733 void MacroAssembler::testbool(Register dst) {
3734 if(sizeof(bool) == 1)
3735 testb(dst, 0xff);
3736 else if(sizeof(bool) == 2) {
3737 // testw implementation needed for two byte bools
3738 ShouldNotReachHere();
3739 } else if(sizeof(bool) == 4)
3740 testl(dst, dst);
3741 else
3742 // unsupported
3743 ShouldNotReachHere();
3744 }
3745
3746 void MacroAssembler::testptr(Register dst, Register src) {
3747 LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
3748 }
3749
3750 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3751 void MacroAssembler::tlab_allocate(Register thread, Register obj,
3752 Register var_size_in_bytes,
3753 int con_size_in_bytes,
3754 Register t1,
3755 Register t2,
3756 Label& slow_case) {
3757 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3758 bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3759 }
3760
3761 // Defines obj, preserves var_size_in_bytes
3762 void MacroAssembler::eden_allocate(Register thread, Register obj,
3763 Register var_size_in_bytes,
3764 int con_size_in_bytes,
3765 Register t1,
3766 Label& slow_case) {
3767 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3768 bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
3769 }
3770
3771 // Preserves the contents of address; destroys the contents of length_in_bytes and temp.
3772 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
3773 assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
3774 assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
3775 Label done;
3776
3777 testptr(length_in_bytes, length_in_bytes);
3778 jcc(Assembler::zero, done);
3779
3780 // initialize topmost word, divide index by 2, check if odd and test if zero
3781 // note: for the remaining code to work, index must be a multiple of BytesPerWord
3782 #ifdef ASSERT
3783 {
3784 Label L;
3785 testptr(length_in_bytes, BytesPerWord - 1);
3786 jcc(Assembler::zero, L);
3787 stop("length must be a multiple of BytesPerWord");
3788 bind(L);
3789 }
3790 #endif
3791 Register index = length_in_bytes;
3792 xorptr(temp, temp); // use _zero reg to clear memory (shorter code)
3793 if (UseIncDec) {
3794 shrptr(index, 3); // divide by 8/16 and set carry flag if bit 2 was set
3795 } else {
3796 shrptr(index, 2); // use 2 instructions to avoid partial flag stall
3797 shrptr(index, 1);
3798 }
3799 #ifndef _LP64
3800 // index could have not been a multiple of 8 (i.e., bit 2 was set)
3801 {
3802 Label even;
3803 // note: if index was a multiple of 8, then it cannot
3804 // be 0 now otherwise it must have been 0 before
3805 // => if it is even, we don't need to check for 0 again
3806 jcc(Assembler::carryClear, even);
3807 // clear topmost word (no jump would be needed if conditional assignment worked here)
3808 movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
3809 // index could be 0 now, must check again
3810 jcc(Assembler::zero, done);
3811 bind(even);
3812 }
3813 #endif // !_LP64
3814 // initialize remaining object fields: index is a multiple of 2 now
3815 {
3816 Label loop;
3817 bind(loop);
3818 movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
3819 NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
3820 decrement(index);
3821 jcc(Assembler::notZero, loop);
3822 }
3823
3824 bind(done);
3825 }
3826
3827 // Look up the method for a megamorphic invokeinterface call.
3828 // The target method is determined by <intf_klass, itable_index>.
3829 // The receiver klass is in recv_klass.
3830 // On success, the result will be in method_result, and execution falls through.
3831 // On failure, execution transfers to the given label.
3832 void MacroAssembler::lookup_interface_method(Register recv_klass,
3833 Register intf_klass,
3834 RegisterOrConstant itable_index,
3835 Register method_result,
3836 Register scan_temp,
3837 Label& L_no_such_interface,
3838 bool return_method) {
3839 assert_different_registers(recv_klass, intf_klass, scan_temp);
3840 assert_different_registers(method_result, intf_klass, scan_temp);
3841 assert(recv_klass != method_result || !return_method,
3842 "recv_klass can be destroyed when method isn't needed");
3843
3844 assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3845 "caller must use same register for non-constant itable index as for method");
3846
3847 // Compute start of first itableOffsetEntry (which is at the end of the vtable)
3848 int vtable_base = in_bytes(Klass::vtable_start_offset());
3849 int itentry_off = itableMethodEntry::method_offset_in_bytes();
3850 int scan_step = itableOffsetEntry::size() * wordSize;
3851 int vte_size = vtableEntry::size_in_bytes();
3852 Address::ScaleFactor times_vte_scale = Address::times_ptr;
3853 assert(vte_size == wordSize, "else adjust times_vte_scale");
3854
3855 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3856
3857 // %%% Could store the aligned, prescaled offset in the klassoop.
3858 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
3859
3860 if (return_method) {
3861 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3862 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3863 lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
3864 }
3865
3866 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
3867 // if (scan->interface() == intf) {
3868 // result = (klass + scan->offset() + itable_index);
3869 // }
3870 // }
3871 Label search, found_method;
3872
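// The probe is emitted twice: the peeled copy (peel == 1) branches straight to
// found_method on a first-entry hit; the second copy is the loop body, which
// branches back to 'search' until a match or a null entry ends the scan.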
3873 for (int peel = 1; peel >= 0; peel--) {
3874 movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
3875 cmpptr(intf_klass, method_result);
3876
3877 if (peel) {
3878 jccb(Assembler::equal, found_method);
3879 } else {
3880 jccb(Assembler::notEqual, search);
3881 // (invert the test to fall through to found_method...)
3882 }
3883
3884 if (!peel) break;
3885
3886 bind(search);
3887
3888 // Check that the previous entry is non-null. A null entry means that
3889 // the receiver class doesn't implement the interface, and wasn't the
3890 // same as when the caller was compiled.
3891 testptr(method_result, method_result);
3892 jcc(Assembler::zero, L_no_such_interface);
3893 addptr(scan_temp, scan_step);
3894 }
3895
3896 bind(found_method);
3897
3898 if (return_method) {
3899 // Got a hit.
3900 movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
3901 movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
3902 }
3903 }
3904
3905
3906 // virtual method calling
3907 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3908 RegisterOrConstant vtable_index,
3909 Register method_result) {
3910 const int base = in_bytes(Klass::vtable_start_offset());
3911 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
3912 Address vtable_entry_addr(recv_klass,
3913 vtable_index, Address::times_ptr,
3914 base + vtableEntry::method_offset_in_bytes());
3915 movptr(method_result, vtable_entry_addr);
3916 }
3917
3918
3919 void MacroAssembler::check_klass_subtype(Register sub_klass,
3920 Register super_klass,
3921 Register temp_reg,
3922 Label& L_success) {
3923 Label L_failure;
3924 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
3925 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
3926 bind(L_failure);
3927 }
3928
3929
3930 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3931 Register super_klass,
3932 Register temp_reg,
3933 Label* L_success,
3934 Label* L_failure,
3935 Label* L_slow_path,
3936 RegisterOrConstant super_check_offset) {
3937 assert_different_registers(sub_klass, super_klass, temp_reg);
3938 bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
3939 if (super_check_offset.is_register()) {
3940 assert_different_registers(sub_klass, super_klass,
3941 super_check_offset.as_register());
3942 } else if (must_load_sco) {
3943 assert(temp_reg != noreg, "supply either a temp or a register offset");
3944 }
3945
3946 Label L_fallthrough;
3947 int label_nulls = 0;
3948 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
3949 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
3950 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
3951 assert(label_nulls <= 1, "at most one NULL in the batch");
3952
3953 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3954 int sco_offset = in_bytes(Klass::super_check_offset_offset());
3955 Address super_check_offset_addr(super_klass, sco_offset);
3956
3957 // Hacked jcc, which "knows" that L_fallthrough, at least, is in
3958 // range of a jccb. If this routine grows larger, reconsider at
3959 // least some of these.
3960 #define local_jcc(assembler_cond, label) \
3961 if (&(label) == &L_fallthrough) jccb(assembler_cond, label); \
3962 else jcc( assembler_cond, label) /*omit semi*/
3963
3964 // Hacked jmp, which may only be used just before L_fallthrough.
3965 #define final_jmp(label) \
3966 if (&(label) == &L_fallthrough) { /*do nothing*/ } \
3967 else jmp(label) /*omit semi*/
3968
3969 // If the pointers are equal, we are done (e.g., String[] elements).
3970 // This self-check enables sharing of secondary supertype arrays among
3971 // non-primary types such as array-of-interface. Otherwise, each such
3972 // type would need its own customized SSA.
3973 // We move this check to the front of the fast path because many
3974 // type checks are in fact trivially successful in this manner,
3975 // so we get a nicely predicted branch right at the start of the check.
3976 cmpptr(sub_klass, super_klass);
3977 local_jcc(Assembler::equal, *L_success);
3978
3979 // Check the supertype display:
3980 if (must_load_sco) {
3981 // Positive movl does right thing on LP64.
3982 movl(temp_reg, super_check_offset_addr);
3983 super_check_offset = RegisterOrConstant(temp_reg);
3984 }
3985 Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
3986 cmpptr(super_klass, super_check_addr); // load displayed supertype
3987
3988 // This check has worked decisively for primary supers.
3989 // Secondary supers are sought in the super_cache ('super_cache_addr').
3990 // (Secondary supers are interfaces and very deeply nested subtypes.)
3991 // This works in the same check above because of a tricky aliasing
3992 // between the super_cache and the primary super display elements.
3993 // (The 'super_check_addr' can address either, as the case requires.)
3994 // Note that the cache is updated below if it does not help us find
3995 // what we need immediately.
3996 // So if it was a primary super, we can just fail immediately.
3997 // Otherwise, it's the slow path for us (no success at this point).
3998
3999 if (super_check_offset.is_register()) {
4000 local_jcc(Assembler::equal, *L_success);
4001 cmpl(super_check_offset.as_register(), sc_offset);
4002 if (L_failure == &L_fallthrough) {
4003 local_jcc(Assembler::equal, *L_slow_path);
4004 } else {
4005 local_jcc(Assembler::notEqual, *L_failure);
4006 final_jmp(*L_slow_path);
4007 }
4008 } else if (super_check_offset.as_constant() == sc_offset) {
4009 // Need a slow path; fast failure is impossible.
4010 if (L_slow_path == &L_fallthrough) {
4011 local_jcc(Assembler::equal, *L_success);
4012 } else {
4013 local_jcc(Assembler::notEqual, *L_slow_path);
4014 final_jmp(*L_success);
4015 }
4016 } else {
4017 // No slow path; it's a fast decision.
4018 if (L_failure == &L_fallthrough) {
4019 local_jcc(Assembler::equal, *L_success);
4020 } else {
4021 local_jcc(Assembler::notEqual, *L_failure);
4022 final_jmp(*L_success);
4023 }
4024 }
4025
4026 bind(L_fallthrough);
4027
4028 #undef local_jcc
4029 #undef final_jmp
4030 }
4031
4032
4033 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4034 Register super_klass,
4035 Register temp_reg,
4036 Register temp2_reg,
4037 Label* L_success,
4038 Label* L_failure,
4039 bool set_cond_codes) {
4040 assert_different_registers(sub_klass, super_klass, temp_reg);
4041 if (temp2_reg != noreg)
4042 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4043 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4044
4045 Label L_fallthrough;
4046 int label_nulls = 0;
4047 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
4048 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
4049 assert(label_nulls <= 1, "at most one NULL in the batch");
4050
4051 // a couple of useful fields in sub_klass:
4052 int ss_offset = in_bytes(Klass::secondary_supers_offset());
4053 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4054 Address secondary_supers_addr(sub_klass, ss_offset);
4055 Address super_cache_addr( sub_klass, sc_offset);
4056
4057 // Do a linear scan of the secondary super-klass chain.
4058 // This code is rarely used, so simplicity is a virtue here.
4059 // The repne_scan instruction uses fixed registers, which we must spill.
4060 // Don't worry too much about pre-existing connections with the input regs.
4061
4062 assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4063 assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4064
4065 // Get super_klass value into rax (even if it was in rdi or rcx).
4066 bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4067 if (super_klass != rax) {
4068 if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4069 mov(rax, super_klass);
4070 }
4071 if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4072 if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4073
4074 #ifndef PRODUCT
4075 int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4076 ExternalAddress pst_counter_addr((address) pst_counter);
4077 NOT_LP64( incrementl(pst_counter_addr) );
4078 LP64_ONLY( lea(rcx, pst_counter_addr) );
4079 LP64_ONLY( incrementl(Address(rcx, 0)) );
4080 #endif //PRODUCT
4081
4082 // We will consult the secondary-super array.
4083 movptr(rdi, secondary_supers_addr);
4084 // Load the array length. (Positive movl does right thing on LP64.)
4085 movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4086 // Skip to start of data.
4087 addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4088
4089 // Scan RCX words at [RDI] for an occurrence of RAX.
4090 // Set NZ/Z based on last compare.
4091 // The Z flag will not be set by 'repne' if RCX == 0, since 'repne' itself does
4092 // not change flags; only the repeated scas instruction sets them.
4093 // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
4094
4095 testptr(rax,rax); // Set Z = 0
4096 repne_scan();
4097
4098 // Unspill the temp. registers:
4099 if (pushed_rdi) pop(rdi);
4100 if (pushed_rcx) pop(rcx);
4101 if (pushed_rax) pop(rax);
4102
4103 if (set_cond_codes) {
4104 // Special hack for the AD files: rdi is guaranteed non-zero.
4105 assert(!pushed_rdi, "rdi must be left non-NULL");
4106 // Also, the condition codes are properly set Z/NZ on succeed/failure.
4107 }
4108
4109 if (L_failure == &L_fallthrough)
4110 jccb(Assembler::notEqual, *L_failure);
4111 else jcc(Assembler::notEqual, *L_failure);
4112
4113 // Success. Cache the super we found and proceed in triumph.
4114 movptr(super_cache_addr, super_klass);
4115
4116 if (L_success != &L_fallthrough) {
4117 jmp(*L_success);
4118 }
4119
4120 #undef IS_A_TEMP
4121
4122 bind(L_fallthrough);
4123 }
4124
4125 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4126 assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
4127
4128 Label L_fallthrough;
4129 if (L_fast_path == NULL) {
4130 L_fast_path = &L_fallthrough;
4131 } else if (L_slow_path == NULL) {
4132 L_slow_path = &L_fallthrough;
4133 }
4134
4135 // Fast path check: class is fully initialized
4136 cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4137 jcc(Assembler::equal, *L_fast_path);
4138
4139 // Fast path check: current thread is initializer thread
4140 cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4141 if (L_slow_path == &L_fallthrough) {
4142 jcc(Assembler::equal, *L_fast_path);
4143 bind(*L_slow_path);
4144 } else if (L_fast_path == &L_fallthrough) {
4145 jcc(Assembler::notEqual, *L_slow_path);
4146 bind(*L_fast_path);
4147 } else {
4148 Unimplemented();
4149 }
4150 }
4151
4152 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
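// On processors without CMOV, emulate it by branching around a plain move.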
4153 if (VM_Version::supports_cmov()) {
4154 cmovl(cc, dst, src);
4155 } else {
4156 Label L;
4157 jccb(negate_condition(cc), L);
4158 movl(dst, src);
4159 bind(L);
4160 }
4161 }
4162
4163 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4164 if (VM_Version::supports_cmov()) {
4165 cmovl(cc, dst, src);
4166 } else {
4167 Label L;
4168 jccb(negate_condition(cc), L);
4169 movl(dst, src);
4170 bind(L);
4171 }
4172 }
4173
4174 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4175 if (!VerifyOops) return;
4176
4177 // Pass register number to verify_oop_subroutine
4178 const char* b = NULL;
4179 {
4180 ResourceMark rm;
4181 stringStream ss;
4182 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4183 b = code_string(ss.as_string());
4184 }
4185 BLOCK_COMMENT("verify_oop {");
4186 #ifdef _LP64
4187 push(rscratch1); // save r10, trashed by movptr()
4188 #endif
4189 push(rax); // save rax,
4190 push(reg); // pass register argument
4191 ExternalAddress buffer((address) b);
4192 // avoid using pushptr, as it modifies scratch registers
4193 // and our contract is not to modify anything
4194 movptr(rax, buffer.addr());
4195 push(rax);
4196 // call indirectly to solve generation ordering problem
4197 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4198 call(rax);
4199 // Caller pops the arguments (oop, message) and restores rax, r10
4200 BLOCK_COMMENT("} verify_oop");
4201 }
4202
4203 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
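// Fill dst with all ones: vpternlogd with immediate 0xFF produces all-ones regardless
// of its inputs; otherwise compare the register with itself (equal lanes give all-ones).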
4204 if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4205 vpternlogd(dst, 0xFF, dst, dst, vector_len);
4206 } else {
4207 assert(UseAVX > 0, "");
4208 vpcmpeqb(dst, dst, dst, vector_len);
4209 }
4210 }
4211
4212 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4213 int extra_slot_offset) {
4214 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4215 int stackElementSize = Interpreter::stackElementSize;
4216 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4217 #ifdef ASSERT
4218 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4219 assert(offset1 - offset == stackElementSize, "correct arithmetic");
4220 #endif
4221 Register scale_reg = noreg;
4222 Address::ScaleFactor scale_factor = Address::no_scale;
4223 if (arg_slot.is_constant()) {
4224 offset += arg_slot.as_constant() * stackElementSize;
4225 } else {
4226 scale_reg = arg_slot.as_register();
4227 scale_factor = Address::times(stackElementSize);
4228 }
4229 offset += wordSize; // return PC is on stack
4230 return Address(rsp, scale_reg, scale_factor, offset);
4231 }
4232
4233 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
4234 if (!VerifyOops) return;
4235
4236 // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4237 // Pass register number to verify_oop_subroutine
4238 const char* b = NULL;
4239 {
4240 ResourceMark rm;
4241 stringStream ss;
4242 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
4243 b = code_string(ss.as_string());
4244 }
4245 #ifdef _LP64
4246 push(rscratch1); // save r10, trashed by movptr()
4247 #endif
4248 push(rax); // save rax,
4249 // addr may contain rsp so we will have to adjust it based on the push
4250 // we just did (and on 64 bit we do two pushes)
4251 // NOTE: the 64-bit code appears to have had a bug here: it emitted movq(addr, rax),
4252 // which stores rax into addr, the reverse of what was intended.
4253 if (addr.uses(rsp)) {
4254 lea(rax, addr);
4255 pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4256 } else {
4257 pushptr(addr);
4258 }
4259
4260 ExternalAddress buffer((address) b);
4261 // pass msg argument
4262 // avoid using pushptr, as it modifies scratch registers
4263 // and our contract is not to modify anything
4264 movptr(rax, buffer.addr());
4265 push(rax);
4266
4267 // call indirectly to solve generation ordering problem
4268 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4269 call(rax);
4270 // Caller pops the arguments (addr, message) and restores rax, r10.
4271 }
4272
4273 void MacroAssembler::verify_tlab() {
4274 #ifdef ASSERT
4275 if (UseTLAB && VerifyOops) {
4276 Label next, ok;
4277 Register t1 = rsi;
4278 Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4279
4280 push(t1);
4281 NOT_LP64(push(thread_reg));
4282 NOT_LP64(get_thread(thread_reg));
4283
4284 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4285 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4286 jcc(Assembler::aboveEqual, next);
4287 STOP("assert(top >= start)");
4288 should_not_reach_here();
4289
4290 bind(next);
4291 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4292 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4293 jcc(Assembler::aboveEqual, ok);
4294 STOP("assert(top <= end)");
4295 should_not_reach_here();
4296
4297 bind(ok);
4298 NOT_LP64(pop(thread_reg));
4299 pop(t1);
4300 }
4301 #endif
4302 }
4303
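// Helper classes for decoding the register state saved by push_CPU_state(); they are
// used by print_CPU_state() and, on 32-bit, by verify_FPU() below.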
4304 class ControlWord {
4305 public:
4306 int32_t _value;
4307
4308 int rounding_control() const { return (_value >> 10) & 3 ; }
4309 int precision_control() const { return (_value >> 8) & 3 ; }
4310 bool precision() const { return ((_value >> 5) & 1) != 0; }
4311 bool underflow() const { return ((_value >> 4) & 1) != 0; }
4312 bool overflow() const { return ((_value >> 3) & 1) != 0; }
4313 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
4314 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
4315 bool invalid() const { return ((_value >> 0) & 1) != 0; }
4316
4317 void print() const {
4318 // rounding control
4319 const char* rc;
4320 switch (rounding_control()) {
4321 case 0: rc = "round near"; break;
4322 case 1: rc = "round down"; break;
4323 case 2: rc = "round up "; break;
4324 case 3: rc = "chop "; break;
4325 default:
4326 rc = NULL; // silence compiler warnings
4327 fatal("Unknown rounding control: %d", rounding_control());
4328 };
4329 // precision control
4330 const char* pc;
4331 switch (precision_control()) {
4332 case 0: pc = "24 bits "; break;
4333 case 1: pc = "reserved"; break;
4334 case 2: pc = "53 bits "; break;
4335 case 3: pc = "64 bits "; break;
4336 default:
4337 pc = NULL; // silence compiler warnings
4338 fatal("Unknown precision control: %d", precision_control());
4339 };
4340 // flags
4341 char f[9];
4342 f[0] = ' ';
4343 f[1] = ' ';
4344 f[2] = (precision ()) ? 'P' : 'p';
4345 f[3] = (underflow ()) ? 'U' : 'u';
4346 f[4] = (overflow ()) ? 'O' : 'o';
4347 f[5] = (zero_divide ()) ? 'Z' : 'z';
4348 f[6] = (denormalized()) ? 'D' : 'd';
4349 f[7] = (invalid ()) ? 'I' : 'i';
4350 f[8] = '\x0';
4351 // output
4352 printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4353 }
4354
4355 };
4356
4357 class StatusWord {
4358 public:
4359 int32_t _value;
4360
4361 bool busy() const { return ((_value >> 15) & 1) != 0; }
4362 bool C3() const { return ((_value >> 14) & 1) != 0; }
4363 bool C2() const { return ((_value >> 10) & 1) != 0; }
4364 bool C1() const { return ((_value >> 9) & 1) != 0; }
4365 bool C0() const { return ((_value >> 8) & 1) != 0; }
4366 int top() const { return (_value >> 11) & 7 ; }
4367 bool error_status() const { return ((_value >> 7) & 1) != 0; }
4368 bool stack_fault() const { return ((_value >> 6) & 1) != 0; }
4369 bool precision() const { return ((_value >> 5) & 1) != 0; }
4370 bool underflow() const { return ((_value >> 4) & 1) != 0; }
4371 bool overflow() const { return ((_value >> 3) & 1) != 0; }
4372 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
4373 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
4374 bool invalid() const { return ((_value >> 0) & 1) != 0; }
4375
4376 void print() const {
4377 // condition codes
4378 char c[5];
4379 c[0] = (C3()) ? '3' : '-';
4380 c[1] = (C2()) ? '2' : '-';
4381 c[2] = (C1()) ? '1' : '-';
4382 c[3] = (C0()) ? '0' : '-';
4383 c[4] = '\x0';
4384 // flags
4385 char f[9];
4386 f[0] = (error_status()) ? 'E' : '-';
4387 f[1] = (stack_fault ()) ? 'S' : '-';
4388 f[2] = (precision ()) ? 'P' : '-';
4389 f[3] = (underflow ()) ? 'U' : '-';
4390 f[4] = (overflow ()) ? 'O' : '-';
4391 f[5] = (zero_divide ()) ? 'Z' : '-';
4392 f[6] = (denormalized()) ? 'D' : '-';
4393 f[7] = (invalid ()) ? 'I' : '-';
4394 f[8] = '\x0';
4395 // output
4396 printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, f, c, top());
4397 }
4398
4399 };
4400
4401 class TagWord {
4402 public:
4403 int32_t _value;
4404
4405 int tag_at(int i) const { return (_value >> (i*2)) & 3; }
4406
4407 void print() const {
4408 printf("%04x", _value & 0xFFFF);
4409 }
4410
4411 };
4412
4413 class FPU_Register {
4414 public:
4415 int32_t _m0;
4416 int32_t _m1;
4417 int16_t _ex;
4418
4419 bool is_indefinite() const {
4420 return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4421 }
4422
4423 void print() const {
4424 char sign = (_ex < 0) ? '-' : '+';
4425 const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : " ";
4426 printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, kind);
4427 };
4428
4429 };
4430
4431 class FPU_State {
4432 public:
4433 enum {
4434 register_size = 10,
4435 number_of_registers = 8,
4436 register_mask = 7
4437 };
4438
4439 ControlWord _control_word;
4440 StatusWord _status_word;
4441 TagWord _tag_word;
4442 int32_t _error_offset;
4443 int32_t _error_selector;
4444 int32_t _data_offset;
4445 int32_t _data_selector;
4446 int8_t _register[register_size * number_of_registers];
4447
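// Note: the data registers are saved in stack order (ST(0) first), while the tag word
// is indexed by physical register number; tag_for_st() translates between the two.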
4448 int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4449 FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; }
4450
4451 const char* tag_as_string(int tag) const {
4452 switch (tag) {
4453 case 0: return "valid";
4454 case 1: return "zero";
4455 case 2: return "special";
4456 case 3: return "empty";
4457 }
4458 ShouldNotReachHere();
4459 return NULL;
4460 }
4461
4462 void print() const {
4463 // print computation registers
4464 { int t = _status_word.top();
4465 for (int i = 0; i < number_of_registers; i++) {
4466 int j = (i - t) & register_mask;
4467 printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4468 st(j)->print();
4469 printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4470 }
4471 }
4472 printf("\n");
4473 // print control registers
4474 printf("ctrl = "); _control_word.print(); printf("\n");
4475 printf("stat = "); _status_word .print(); printf("\n");
4476 printf("tags = "); _tag_word .print(); printf("\n");
4477 }
4478
4479 };
4480
4481 class Flag_Register {
4482 public:
4483 int32_t _value;
4484
4485 bool overflow() const { return ((_value >> 11) & 1) != 0; }
4486 bool direction() const { return ((_value >> 10) & 1) != 0; }
4487 bool sign() const { return ((_value >> 7) & 1) != 0; }
4488 bool zero() const { return ((_value >> 6) & 1) != 0; }
4489 bool auxiliary_carry() const { return ((_value >> 4) & 1) != 0; }
4490 bool parity() const { return ((_value >> 2) & 1) != 0; }
4491 bool carry() const { return ((_value >> 0) & 1) != 0; }
4492
4493 void print() const {
4494 // flags
4495 char f[8];
4496 f[0] = (overflow ()) ? 'O' : '-';
4497 f[1] = (direction ()) ? 'D' : '-';
4498 f[2] = (sign ()) ? 'S' : '-';
4499 f[3] = (zero ()) ? 'Z' : '-';
4500 f[4] = (auxiliary_carry()) ? 'A' : '-';
4501 f[5] = (parity ()) ? 'P' : '-';
4502 f[6] = (carry ()) ? 'C' : '-';
4503 f[7] = '\x0';
4504 // output
4505 printf("%08x flags = %s", _value, f);
4506 }
4507
4508 };
4509
4510 class IU_Register {
4511 public:
4512 int32_t _value;
4513
4514 void print() const {
4515 printf("%08x %11d", _value, _value);
4516 }
4517
4518 };
4519
4520 class IU_State {
4521 public:
4522 Flag_Register _eflags;
4523 IU_Register _rdi;
4524 IU_Register _rsi;
4525 IU_Register _rbp;
4526 IU_Register _rsp;
4527 IU_Register _rbx;
4528 IU_Register _rdx;
4529 IU_Register _rcx;
4530 IU_Register _rax;
4531
4532 void print() const {
4533 // computation registers
4534 printf("rax, = "); _rax.print(); printf("\n");
4535 printf("rbx, = "); _rbx.print(); printf("\n");
4536 printf("rcx = "); _rcx.print(); printf("\n");
4537 printf("rdx = "); _rdx.print(); printf("\n");
4538 printf("rdi = "); _rdi.print(); printf("\n");
4539 printf("rsi = "); _rsi.print(); printf("\n");
4540 printf("rbp, = "); _rbp.print(); printf("\n");
4541 printf("rsp = "); _rsp.print(); printf("\n");
4542 printf("\n");
4543 // control registers
4544 printf("flgs = "); _eflags.print(); printf("\n");
4545 }
4546 };
4547
4548
4549 class CPU_State {
4550 public:
4551 FPU_State _fpu_state;
4552 IU_State _iu_state;
4553
4554 void print() const {
4555 printf("--------------------------------------------------\n");
4556 _iu_state .print();
4557 printf("\n");
4558 _fpu_state.print();
4559 printf("--------------------------------------------------\n");
4560 }
4561
4562 };
4563
4564
4565 static void _print_CPU_state(CPU_State* state) {
4566 state->print();
4567 };
4568
4569
4570 void MacroAssembler::print_CPU_state() {
4571 push_CPU_state();
4572 push(rsp); // pass CPU state
4573 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
4574 addptr(rsp, wordSize); // discard argument
4575 pop_CPU_state();
4576 }
4577
4578
4579 #ifndef _LP64
4580 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
4581 static int counter = 0;
4582 FPU_State* fs = &state->_fpu_state;
4583 counter++;
4584 // For leaf calls, only verify that the top few elements remain empty.
4585 // We only need 1 empty at the top for C2 code.
4586 if( stack_depth < 0 ) {
4587 if( fs->tag_for_st(7) != 3 ) {
4588 printf("FPR7 not empty\n");
4589 state->print();
4590 assert(false, "error");
4591 return false;
4592 }
4593 return true; // All other stack states do not matter
4594 }
4595
4596 assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(),
4597 "bad FPU control word");
4598
4599 // compute stack depth
4600 int i = 0;
4601 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) < 3) i++;
4602 int d = i;
4603 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
4604 // verify findings
4605 if (i != FPU_State::number_of_registers) {
4606 // stack not contiguous
4607 printf("%s: stack not contiguous at ST%d\n", s, i);
4608 state->print();
4609 assert(false, "error");
4610 return false;
4611 }
4612 // check if computed stack depth corresponds to expected stack depth
4613 if (stack_depth < 0) {
4614 // expected stack depth is -stack_depth or less
4615 if (d > -stack_depth) {
4616 // too many elements on the stack
4617 printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
4618 state->print();
4619 assert(false, "error");
4620 return false;
4621 }
4622 } else {
4623 // expected stack depth is stack_depth
4624 if (d != stack_depth) {
4625 // wrong stack depth
4626 printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
4627 state->print();
4628 assert(false, "error");
4629 return false;
4630 }
4631 }
4632 // everything is cool
4633 return true;
4634 }
4635
4636 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
4637 if (!VerifyFPU) return;
4638 push_CPU_state();
4639 push(rsp); // pass CPU state
4640 ExternalAddress msg((address) s);
4641 // pass message string s
4642 pushptr(msg.addr());
4643 push(stack_depth); // pass stack depth
4644 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
4645 addptr(rsp, 3 * wordSize); // discard arguments
4646 // check for error
4647 { Label L;
4648 testl(rax, rax);
4649 jcc(Assembler::notZero, L);
4650 int3(); // break if error condition
4651 bind(L);
4652 }
4653 pop_CPU_state();
4654 }
4655 #endif // _LP64
4656
4657 void MacroAssembler::restore_cpu_control_state_after_jni() {
4658 // Either restore the MXCSR register after returning from the JNI Call
4659 // or verify that it wasn't changed (with -Xcheck:jni flag).
4660 if (VM_Version::supports_sse()) {
4661 if (RestoreMXCSROnJNICalls) {
4662 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()));
4663 } else if (CheckJNICalls) {
4664 call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
4665 }
4666 }
4667 // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
4668 vzeroupper();
4669 // Reset k1 to 0xffff.
4670
4671 #ifdef COMPILER2
4672 if (PostLoopMultiversioning && VM_Version::supports_evex()) {
4673 push(rcx);
4674 movl(rcx, 0xffff);
4675 kmovwl(k1, rcx);
4676 pop(rcx);
4677 }
4678 #endif // COMPILER2
4679
4680 #ifndef _LP64
4681 // Either restore the x87 floating-point control word after returning
4682 // from the JNI call or verify that it wasn't changed.
4683 if (CheckJNICalls) {
4684 call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
4685 }
4686 #endif // _LP64
4687 }
4688
4689 // ((OopHandle)result).resolve();
4690 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
4691 assert_different_registers(result, tmp);
4692
4693 // Only 64 bit platforms support GCs that require a tmp register
4694 // Only IN_HEAP loads require a thread_tmp register
4695 // OopHandle::resolve is an indirection like jobject.
4696 access_load_at(T_OBJECT, IN_NATIVE,
4697 result, Address(result, 0), tmp, /*tmp_thread*/noreg);
4698 }
4699
4700 // ((WeakHandle)result).resolve();
4701 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
4702 assert_different_registers(rresult, rtmp);
4703 Label resolved;
4704
4705 // A null weak handle resolves to null.
4706 cmpptr(rresult, 0);
4707 jcc(Assembler::equal, resolved);
4708
4709 // Only 64 bit platforms support GCs that require a tmp register
4710 // Only IN_HEAP loads require a thread_tmp register
4711 // WeakHandle::resolve is an indirection like jweak.
4712 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4713 rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
4714 bind(resolved);
4715 }
4716
4717 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
4718 // get mirror
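//   mirror = method->method_holder()->java_mirror(), then resolve the OopHandle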
4719 const int mirror_offset = in_bytes(Klass::java_mirror_offset());
4720 load_method_holder(mirror, method);
4721 movptr(mirror, Address(mirror, mirror_offset));
4722 resolve_oop_handle(mirror, tmp);
4723 }
4724
4725 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
4726 load_method_holder(rresult, rmethod);
4727 movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
4728 }
4729
4730 void MacroAssembler::load_method_holder(Register holder, Register method) {
4731 movptr(holder, Address(method, Method::const_offset())); // ConstMethod*
4732 movptr(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool*
4733 movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
4734 }
4735
4736 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
4737 assert_different_registers(src, tmp);
4738 assert_different_registers(dst, tmp);
4739 #ifdef _LP64
4740 if (UseCompressedClassPointers) {
4741 movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4742 decode_klass_not_null(dst, tmp);
4743 } else
4744 #endif
4745 movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4746 }
4747
4748 void MacroAssembler::load_prototype_header(Register dst, Register src, Register tmp) {
4749 load_klass(dst, src, tmp);
4750 movptr(dst, Address(dst, Klass::prototype_header_offset()));
4751 }
4752
4753 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
4754 assert_different_registers(src, tmp);
4755 assert_different_registers(dst, tmp);
4756 #ifdef _LP64
4757 if (UseCompressedClassPointers) {
4758 encode_klass_not_null(src, tmp);
4759 movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4760 } else
4761 #endif
4762 movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4763 }
4764
4765 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
4766 Register tmp1, Register thread_tmp) {
4767 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4768 decorators = AccessInternal::decorator_fixup(decorators);
4769 bool as_raw = (decorators & AS_RAW) != 0;
4770 if (as_raw) {
4771 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4772 } else {
4773 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4774 }
4775 }
4776
4777 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
4778 Register tmp1, Register tmp2) {
4779 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4780 decorators = AccessInternal::decorator_fixup(decorators);
4781 bool as_raw = (decorators & AS_RAW) != 0;
4782 if (as_raw) {
4783 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
4784 } else {
4785 bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
4786 }
4787 }
4788
4789 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4790 Register thread_tmp, DecoratorSet decorators) {
4791 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4792 }
4793
4794 // Doesn't do verification; generates fixed-size code
4795 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4796 Register thread_tmp, DecoratorSet decorators) {
4797 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4798 }
4799
4800 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4801 Register tmp2, DecoratorSet decorators) {
4802 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
4803 }
4804
4805 // Used for storing NULLs.
4806 void MacroAssembler::store_heap_oop_null(Address dst) {
4807 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4808 }
4809
4810 #ifdef _LP64
4811 void MacroAssembler::store_klass_gap(Register dst, Register src) {
4812 if (UseCompressedClassPointers) {
4813 // Store to klass gap in destination
4814 movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
4815 }
4816 }
4817
4818 #ifdef ASSERT
4819 void MacroAssembler::verify_heapbase(const char* msg) {
4820 assert (UseCompressedOops, "should be compressed");
4821 assert (Universe::heap() != NULL, "java heap should be initialized");
4822 if (CheckCompressedOops) {
4823 Label ok;
4824 push(rscratch1); // cmpptr trashes rscratch1
4825 cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
4826 jcc(Assembler::equal, ok);
4827 STOP(msg);
4828 bind(ok);
4829 pop(rscratch1);
4830 }
4831 }
4832 #endif
4833
4834 // Algorithm must match oop.inline.hpp encode_heap_oop.
4835 void MacroAssembler::encode_heap_oop(Register r) {
4836 #ifdef ASSERT
4837 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
4838 #endif
4839 verify_oop_msg(r, "broken oop in encode_heap_oop");
4840 if (CompressedOops::base() == NULL) {
4841 if (CompressedOops::shift() != 0) {
4842 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4843 shrq(r, LogMinObjAlignmentInBytes);
4844 }
4845 return;
4846 }
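// Preserve NULL: if r is NULL, substitute the heap base so the subtraction below yields 0.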
4847 testq(r, r);
4848 cmovq(Assembler::equal, r, r12_heapbase);
4849 subq(r, r12_heapbase);
4850 shrq(r, LogMinObjAlignmentInBytes);
4851 }
4852
4853 void MacroAssembler::encode_heap_oop_not_null(Register r) {
4854 #ifdef ASSERT
4855 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
4856 if (CheckCompressedOops) {
4857 Label ok;
4858 testq(r, r);
4859 jcc(Assembler::notEqual, ok);
4860 STOP("null oop passed to encode_heap_oop_not_null");
4861 bind(ok);
4862 }
4863 #endif
4864 verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
4865 if (CompressedOops::base() != NULL) {
4866 subq(r, r12_heapbase);
4867 }
4868 if (CompressedOops::shift() != 0) {
4869 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4870 shrq(r, LogMinObjAlignmentInBytes);
4871 }
4872 }
4873
4874 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
4875 #ifdef ASSERT
4876 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
4877 if (CheckCompressedOops) {
4878 Label ok;
4879 testq(src, src);
4880 jcc(Assembler::notEqual, ok);
4881 STOP("null oop passed to encode_heap_oop_not_null2");
4882 bind(ok);
4883 }
4884 #endif
4885 verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
4886 if (dst != src) {
4887 movq(dst, src);
4888 }
4889 if (CompressedOops::base() != NULL) {
4890 subq(dst, r12_heapbase);
4891 }
4892 if (CompressedOops::shift() != 0) {
4893 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4894 shrq(dst, LogMinObjAlignmentInBytes);
4895 }
4896 }
4897
4898 void MacroAssembler::decode_heap_oop(Register r) {
4899 #ifdef ASSERT
4900 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
4901 #endif
4902 if (CompressedOops::base() == NULL) {
4903 if (CompressedOops::shift() != 0) {
4904 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4905 shlq(r, LogMinObjAlignmentInBytes);
4906 }
4907 } else {
4908 Label done;
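// The shift sets ZF when the narrow oop is NULL; skip adding the heap base so that
// NULL decodes back to NULL.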
4909 shlq(r, LogMinObjAlignmentInBytes);
4910 jccb(Assembler::equal, done);
4911 addq(r, r12_heapbase);
4912 bind(done);
4913 }
4914 verify_oop_msg(r, "broken oop in decode_heap_oop");
4915 }
4916
4917 void MacroAssembler::decode_heap_oop_not_null(Register r) {
4918 // Note: it will change flags
4919 assert (UseCompressedOops, "should only be used for compressed headers");
4920 assert (Universe::heap() != NULL, "java heap should be initialized");
4921 // Cannot assert, unverified entry point counts instructions (see .ad file)
4922 // vtableStubs also counts instructions in pd_code_size_limit.
4923 // Also do not verify_oop as this is called by verify_oop.
4924 if (CompressedOops::shift() != 0) {
4925 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4926 shlq(r, LogMinObjAlignmentInBytes);
4927 if (CompressedOops::base() != NULL) {
4928 addq(r, r12_heapbase);
4929 }
4930 } else {
4931 assert (CompressedOops::base() == NULL, "sanity");
4932 }
4933 }
4934
4935 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
4936 // Note: it will change flags
4937 assert (UseCompressedOops, "should only be used for compressed headers");
4938 assert (Universe::heap() != NULL, "java heap should be initialized");
4939 // Cannot assert, unverified entry point counts instructions (see .ad file)
4940 // vtableStubs also counts instructions in pd_code_size_limit.
4941 // Also do not verify_oop as this is called by verify_oop.
4942 if (CompressedOops::shift() != 0) {
4943 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4944 if (LogMinObjAlignmentInBytes == Address::times_8) {
4945 leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
4946 } else {
4947 if (dst != src) {
4948 movq(dst, src);
4949 }
4950 shlq(dst, LogMinObjAlignmentInBytes);
4951 if (CompressedOops::base() != NULL) {
4952 addq(dst, r12_heapbase);
4953 }
4954 }
4955 } else {
4956 assert (CompressedOops::base() == NULL, "sanity");
4957 if (dst != src) {
4958 movq(dst, src);
4959 }
4960 }
4961 }
4962
4963 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
4964 assert_different_registers(r, tmp);
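  // Mapping implemented below (a sketch, assuming the standard compressed
  // class pointer encoding):
  //   narrow_klass = (uint32_t)((klass - CompressedKlassPointers::base()) >> CompressedKlassPointers::shift());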
4965 if (CompressedKlassPointers::base() != NULL) {
4966 mov64(tmp, (int64_t)CompressedKlassPointers::base());
4967 subq(r, tmp);
4968 }
4969 if (CompressedKlassPointers::shift() != 0) {
4970 assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4971 shrq(r, LogKlassAlignmentInBytes);
4972 }
4973 }
4974
4975 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
4976 assert_different_registers(src, dst);
4977 if (CompressedKlassPointers::base() != NULL) {
4978 mov64(dst, -(int64_t)CompressedKlassPointers::base());
4979 addq(dst, src);
4980 } else {
4981 movptr(dst, src);
4982 }
4983 if (CompressedKlassPointers::shift() != 0) {
4984 assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4985 shrq(dst, LogKlassAlignmentInBytes);
4986 }
4987 }
4988
4989 // !!! If the instructions that get generated here change then function
4990 // instr_size_for_decode_klass_not_null() needs to get updated.
4991 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
4992 assert_different_registers(r, tmp);
4993 // Note: it will change flags
4994 assert(UseCompressedClassPointers, "should only be used for compressed headers");
4995 // Cannot assert, unverified entry point counts instructions (see .ad file)
4996 // vtableStubs also counts instructions in pd_code_size_limit.
4997 // Also do not verify_oop as this is called by verify_oop.
4998 if (CompressedKlassPointers::shift() != 0) {
4999 assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5000 shlq(r, LogKlassAlignmentInBytes);
5001 }
5002 if (CompressedKlassPointers::base() != NULL) {
5003 mov64(tmp, (int64_t)CompressedKlassPointers::base());
5004 addq(r, tmp);
5005 }
5006 }
5007
5008 void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
5009 assert_different_registers(src, dst);
5010 // Note: it will change flags
5011 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5012 // Cannot assert, unverified entry point counts instructions (see .ad file)
5013 // vtableStubs also counts instructions in pd_code_size_limit.
5014 // Also do not verify_oop as this is called by verify_oop.
5015
5016 if (CompressedKlassPointers::base() == NULL &&
5017 CompressedKlassPointers::shift() == 0) {
5018 // The best case scenario is that there is no base or shift. Then it is already
5019 // a pointer that needs nothing but a register rename.
5020 movl(dst, src);
5021 } else {
5022 if (CompressedKlassPointers::base() != NULL) {
5023 mov64(dst, (int64_t)CompressedKlassPointers::base());
5024 } else {
5025 xorq(dst, dst);
5026 }
5027 if (CompressedKlassPointers::shift() != 0) {
5028 assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5029 assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5030 leaq(dst, Address(dst, src, Address::times_8, 0));
5031 } else {
5032 addq(dst, src);
5033 }
5034 }
5035 }
5036
5037 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5038 assert (UseCompressedOops, "should only be used for compressed headers");
5039 assert (Universe::heap() != NULL, "java heap should be initialized");
5040 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5041 int oop_index = oop_recorder()->find_index(obj);
5042 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5043 mov_narrow_oop(dst, oop_index, rspec);
5044 }
5045
5046 void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5047 assert (UseCompressedOops, "should only be used for compressed headers");
5048 assert (Universe::heap() != NULL, "java heap should be initialized");
5049 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5050 int oop_index = oop_recorder()->find_index(obj);
5051 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5052 mov_narrow_oop(dst, oop_index, rspec);
5053 }
5054
5055 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5056 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5057 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5058 int klass_index = oop_recorder()->find_index(k);
5059 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5060 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5061 }
5062
5063 void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5064 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5065 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5066 int klass_index = oop_recorder()->find_index(k);
5067 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5068 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5069 }
5070
5071 void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5072 assert (UseCompressedOops, "should only be used for compressed headers");
5073 assert (Universe::heap() != NULL, "java heap should be initialized");
5074 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5075 int oop_index = oop_recorder()->find_index(obj);
5076 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5077 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5078 }
5079
5080 void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5081 assert (UseCompressedOops, "should only be used for compressed headers");
5082 assert (Universe::heap() != NULL, "java heap should be initialized");
5083 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5084 int oop_index = oop_recorder()->find_index(obj);
5085 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5086 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5087 }
5088
5089 void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5090 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5091 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5092 int klass_index = oop_recorder()->find_index(k);
5093 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5094 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5095 }
5096
5097 void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5098 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5099 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5100 int klass_index = oop_recorder()->find_index(k);
5101 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5102 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5103 }
5104
5105 void MacroAssembler::reinit_heapbase() {
5106 if (UseCompressedOops) {
5107 if (Universe::heap() != NULL) {
5108 if (CompressedOops::base() == NULL) {
5109 MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5110 } else {
5111 mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5112 }
5113 } else {
5114 movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5115 }
5116 }
5117 }
5118
5119 #endif // _LP64
5120
5121 // C2 compiled method's prolog code.
5122 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
5123
5124 // WARNING: Initial instruction MUST be 5 bytes or longer so that
5125 // NativeJump::patch_verified_entry will be able to patch out the entry
5126 // code safely. The push to verify stack depth is ok at 5 bytes,
5127 // the frame allocation can be either 3 or 6 bytes. So if we don't do
5128 // stack bang then we must use the 6 byte frame allocation even if
5129 // we have no frame. :-(
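  // Rough frame picture after this prolog (a sketch): the caller's return
  // address on top, then the saved rbp, then the remaining framesize bytes of
  // C2 spills/locals, with rsp left at the bottom of that area.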
5130 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
5131
5132 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
5133 // Remove word for return addr
5134 framesize -= wordSize;
5135 stack_bang_size -= wordSize;
5136
  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang the stack for them. Be careful,
  // though: some VM calls (such as call site linkage) can use several
  // kilobytes of stack, but the stack safety zone should account for that.
5141 // See bugs 4446381, 4468289, 4497237.
5142 if (stack_bang_size > 0) {
5143 generate_stack_overflow_check(stack_bang_size);
5144
5145 // We always push rbp, so that on return to interpreter rbp, will be
5146 // restored correctly and we can correct the stack.
5147 push(rbp);
5148 // Save caller's stack pointer into RBP if the frame pointer is preserved.
5149 if (PreserveFramePointer) {
5150 mov(rbp, rsp);
5151 }
5152 // Remove word for ebp
5153 framesize -= wordSize;
5154
5155 // Create frame
5156 if (framesize) {
5157 subptr(rsp, framesize);
5158 }
5159 } else {
5160 // Create frame (force generation of a 4 byte immediate value)
5161 subptr_imm32(rsp, framesize);
5162
5163 // Save RBP register now.
5164 framesize -= wordSize;
5165 movptr(Address(rsp, framesize), rbp);
5166 // Save caller's stack pointer into RBP if the frame pointer is preserved.
5167 if (PreserveFramePointer) {
5168 movptr(rbp, rsp);
5169 if (framesize > 0) {
5170 addptr(rbp, framesize);
5171 }
5172 }
5173 }
5174
5175 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
5176 framesize -= wordSize;
5177 movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5178 }
5179
5180 #ifndef _LP64
5181 // If method sets FPU control word do it now
5182 if (fp_mode_24b) {
5183 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
5184 }
5185 if (UseSSE >= 2 && VerifyFPU) {
5186 verify_FPU(0, "FPU stack must be clean on entry");
5187 }
5188 #endif
5189
5190 #ifdef ASSERT
5191 if (VerifyStackAtCalls) {
5192 Label L;
5193 push(rax);
5194 mov(rax, rsp);
5195 andptr(rax, StackAlignmentInBytes-1);
5196 cmpptr(rax, StackAlignmentInBytes-wordSize);
5197 pop(rax);
5198 jcc(Assembler::equal, L);
5199 STOP("Stack is not properly aligned!");
5200 bind(L);
5201 }
5202 #endif
5203
5204 if (!is_stub) {
5205 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5206 bs->nmethod_entry_barrier(this);
5207 }
5208 }
5209
5210 #if COMPILER2_OR_JVMCI
5211
5212 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
5213 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5214 // cnt - number of qwords (8-byte words).
5215 // base - start address, qword aligned.
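  // Roughly equivalent scalar loop (a sketch of what the vector code implements):
  //   for (size_t i = 0; i < cnt; i++) ((julong*)base)[i] = 0;
  // The main loop clears 64 bytes (8 qwords) per iteration; a tail of up to
  // 7 qwords is handled with 32/16/8-byte stores or a masked AVX-512 store.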
5216 Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5217 bool use64byteVector = MaxVectorSize == 64 && AVX3Threshold == 0;
5218 if (use64byteVector) {
5219 vpxor(xtmp, xtmp, xtmp, AVX_512bit);
5220 } else if (MaxVectorSize >= 32) {
5221 vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5222 } else {
5223 pxor(xtmp, xtmp);
5224 }
5225 jmp(L_zero_64_bytes);
5226
5227 BIND(L_loop);
5228 if (MaxVectorSize >= 32) {
5229 fill64_avx(base, 0, xtmp, use64byteVector);
5230 } else {
5231 movdqu(Address(base, 0), xtmp);
5232 movdqu(Address(base, 16), xtmp);
5233 movdqu(Address(base, 32), xtmp);
5234 movdqu(Address(base, 48), xtmp);
5235 }
5236 addptr(base, 64);
5237
5238 BIND(L_zero_64_bytes);
5239 subptr(cnt, 8);
5240 jccb(Assembler::greaterEqual, L_loop);
5241
5242 // Copy trailing 64 bytes
5243 if (use64byteVector) {
5244 addptr(cnt, 8);
5245 jccb(Assembler::equal, L_end);
5246 fill64_masked_avx(3, base, 0, xtmp, mask, cnt, rtmp, true);
5247 jmp(L_end);
5248 } else {
5249 addptr(cnt, 4);
5250 jccb(Assembler::less, L_tail);
5251 if (MaxVectorSize >= 32) {
5252 vmovdqu(Address(base, 0), xtmp);
5253 } else {
5254 movdqu(Address(base, 0), xtmp);
5255 movdqu(Address(base, 16), xtmp);
5256 }
5257 }
5258 addptr(base, 32);
5259 subptr(cnt, 4);
5260
5261 BIND(L_tail);
5262 addptr(cnt, 4);
5263 jccb(Assembler::lessEqual, L_end);
5264 if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
5265 fill32_masked_avx(3, base, 0, xtmp, mask, cnt, rtmp);
5266 } else {
5267 decrement(cnt);
5268
5269 BIND(L_sloop);
5270 movq(Address(base, 0), xtmp);
5271 addptr(base, 8);
5272 decrement(cnt);
5273 jccb(Assembler::greaterEqual, L_sloop);
5274 }
5275 BIND(L_end);
5276 }
5277
5278 // Clearing constant sized memory using YMM/ZMM registers.
5279 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5280 assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), "");
5281 bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
5282
5283 int vector64_count = (cnt & (~0x7)) >> 3;
5284 cnt = cnt & 0x7;
5285 const int fill64_per_loop = 4;
5286 const int max_unrolled_fill64 = 8;
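  // Example (a sketch): cnt == 19 qwords gives vector64_count == 2 (two full
  // 64-byte blocks) and a tail of cnt == 3 qwords, which the switch below
  // clears with a masked 256-bit store (mask 0x7).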
5287
5288 // 64 byte initialization loop.
5289 vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
5290 int start64 = 0;
5291 if (vector64_count > max_unrolled_fill64) {
5292 Label LOOP;
5293 Register index = rtmp;
5294
5295 start64 = vector64_count - (vector64_count % fill64_per_loop);
5296
5297 movl(index, 0);
5298 BIND(LOOP);
5299 for (int i = 0; i < fill64_per_loop; i++) {
5300 fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
5301 }
5302 addl(index, fill64_per_loop * 64);
5303 cmpl(index, start64 * 64);
5304 jccb(Assembler::less, LOOP);
5305 }
5306 for (int i = start64; i < vector64_count; i++) {
5307 fill64_avx(base, i * 64, xtmp, use64byteVector);
5308 }
5309
5310 // Clear remaining 64 byte tail.
5311 int disp = vector64_count * 64;
5312 if (cnt) {
5313 switch (cnt) {
5314 case 1:
5315 movq(Address(base, disp), xtmp);
5316 break;
5317 case 2:
5318 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_128bit);
5319 break;
5320 case 3:
5321 movl(rtmp, 0x7);
5322 kmovwl(mask, rtmp);
5323 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_256bit);
5324 break;
5325 case 4:
5326 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5327 break;
5328 case 5:
5329 if (use64byteVector) {
5330 movl(rtmp, 0x1F);
5331 kmovwl(mask, rtmp);
5332 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5333 } else {
5334 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5335 movq(Address(base, disp + 32), xtmp);
5336 }
5337 break;
5338 case 6:
5339 if (use64byteVector) {
5340 movl(rtmp, 0x3F);
5341 kmovwl(mask, rtmp);
5342 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5343 } else {
5344 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5345 evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, Assembler::AVX_128bit);
5346 }
5347 break;
5348 case 7:
5349 if (use64byteVector) {
5350 movl(rtmp, 0x7F);
5351 kmovwl(mask, rtmp);
5352 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5353 } else {
5354 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5355 movl(rtmp, 0x7);
5356 kmovwl(mask, rtmp);
5357 evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, Assembler::AVX_256bit);
5358 }
5359 break;
5360 default:
        fatal("Unexpected length: %d\n", cnt);
5362 break;
5363 }
5364 }
5365 }
5366
5367 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
5368 bool is_large, KRegister mask) {
5369 // cnt - number of qwords (8-byte words).
5370 // base - start address, qword aligned.
5371 // is_large - if optimizers know cnt is larger than InitArrayShortSize
5372 assert(base==rdi, "base register must be edi for rep stos");
5373 assert(tmp==rax, "tmp register must be eax for rep stos");
5374 assert(cnt==rcx, "cnt register must be ecx for rep stos");
  assert(InitArrayShortSize % BytesPerLong == 0,
         "InitArrayShortSize should be a multiple of BytesPerLong");
5377
5378 Label DONE;
5379 if (!is_large || !UseXMMForObjInit) {
5380 xorptr(tmp, tmp);
5381 }
5382
5383 if (!is_large) {
5384 Label LOOP, LONG;
5385 cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5386 jccb(Assembler::greater, LONG);
5387
5388 NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5389
5390 decrement(cnt);
5391 jccb(Assembler::negative, DONE); // Zero length
5392
5393 // Use individual pointer-sized stores for small counts:
5394 BIND(LOOP);
5395 movptr(Address(base, cnt, Address::times_ptr), tmp);
5396 decrement(cnt);
5397 jccb(Assembler::greaterEqual, LOOP);
5398 jmpb(DONE);
5399
5400 BIND(LONG);
5401 }
5402
5403 // Use longer rep-prefixed ops for non-small counts:
5404 if (UseFastStosb) {
5405 shlptr(cnt, 3); // convert to number of bytes
5406 rep_stosb();
5407 } else if (UseXMMForObjInit) {
5408 xmm_clear_mem(base, cnt, tmp, xtmp, mask);
5409 } else {
5410 NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5411 rep_stos();
5412 }
5413
5414 BIND(DONE);
5415 }
5416
5417 #endif //COMPILER2_OR_JVMCI
5418
5419
5420 void MacroAssembler::generate_fill(BasicType t, bool aligned,
5421 Register to, Register value, Register count,
5422 Register rtmp, XMMRegister xtmp) {
5423 ShortBranchVerifier sbv(this);
5424 assert_different_registers(to, value, count, rtmp);
5425 Label L_exit;
5426 Label L_fill_2_bytes, L_fill_4_bytes;
5427
5428 int shift = -1;
5429 switch (t) {
5430 case T_BYTE:
5431 shift = 2;
5432 break;
5433 case T_SHORT:
5434 shift = 1;
5435 break;
5436 case T_INT:
5437 shift = 0;
5438 break;
5439 default: ShouldNotReachHere();
5440 }
5441
5442 if (t == T_BYTE) {
5443 andl(value, 0xff);
5444 movl(rtmp, value);
5445 shll(rtmp, 8);
5446 orl(value, rtmp);
5447 }
5448 if (t == T_SHORT) {
5449 andl(value, 0xffff);
5450 }
5451 if (t == T_BYTE || t == T_SHORT) {
5452 movl(rtmp, value);
5453 shll(rtmp, 16);
5454 orl(value, rtmp);
5455 }
5456
5457 cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
5458 jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
5459 if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
5460 Label L_skip_align2;
    // align the destination address to a 4 byte boundary
5462 if (t == T_BYTE) {
5463 Label L_skip_align1;
5464 // One byte misalignment happens only for byte arrays
5465 testptr(to, 1);
5466 jccb(Assembler::zero, L_skip_align1);
5467 movb(Address(to, 0), value);
5468 increment(to);
5469 decrement(count);
5470 BIND(L_skip_align1);
5471 }
5472 // Two bytes misalignment happens only for byte and short (char) arrays
5473 testptr(to, 2);
5474 jccb(Assembler::zero, L_skip_align2);
5475 movw(Address(to, 0), value);
5476 addptr(to, 2);
5477 subl(count, 1<<(shift-1));
5478 BIND(L_skip_align2);
5479 }
5480 if (UseSSE < 2) {
5481 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5482 // Fill 32-byte chunks
5483 subl(count, 8 << shift);
5484 jcc(Assembler::less, L_check_fill_8_bytes);
5485 align(16);
5486
5487 BIND(L_fill_32_bytes_loop);
5488
5489 for (int i = 0; i < 32; i += 4) {
5490 movl(Address(to, i), value);
5491 }
5492
5493 addptr(to, 32);
5494 subl(count, 8 << shift);
5495 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5496 BIND(L_check_fill_8_bytes);
5497 addl(count, 8 << shift);
5498 jccb(Assembler::zero, L_exit);
5499 jmpb(L_fill_8_bytes);
5500
5501 //
5502 // length is too short, just fill qwords
5503 //
5504 BIND(L_fill_8_bytes_loop);
5505 movl(Address(to, 0), value);
5506 movl(Address(to, 4), value);
5507 addptr(to, 8);
5508 BIND(L_fill_8_bytes);
5509 subl(count, 1 << (shift + 1));
5510 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5511 // fall through to fill 4 bytes
5512 } else {
5513 Label L_fill_32_bytes;
5514 if (!UseUnalignedLoadStores) {
5515 // align to 8 bytes, we know we are 4 byte aligned to start
5516 testptr(to, 4);
5517 jccb(Assembler::zero, L_fill_32_bytes);
5518 movl(Address(to, 0), value);
5519 addptr(to, 4);
5520 subl(count, 1<<shift);
5521 }
5522 BIND(L_fill_32_bytes);
5523 {
5524 assert( UseSSE >= 2, "supported cpu only" );
5525 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5526 movdl(xtmp, value);
5527 if (UseAVX >= 2 && UseUnalignedLoadStores) {
5528 Label L_check_fill_32_bytes;
5529 if (UseAVX > 2) {
5530 // Fill 64-byte chunks
5531 Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
5532
5533 // If number of bytes to fill < AVX3Threshold, perform fill using AVX2
5534 cmpl(count, AVX3Threshold);
5535 jccb(Assembler::below, L_check_fill_64_bytes_avx2);
5536
5537 vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
5538
5539 subl(count, 16 << shift);
5540 jccb(Assembler::less, L_check_fill_32_bytes);
5541 align(16);
5542
5543 BIND(L_fill_64_bytes_loop_avx3);
5544 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
5545 addptr(to, 64);
5546 subl(count, 16 << shift);
5547 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
5548 jmpb(L_check_fill_32_bytes);
5549
5550 BIND(L_check_fill_64_bytes_avx2);
5551 }
5552 // Fill 64-byte chunks
5553 Label L_fill_64_bytes_loop;
5554 vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
5555
5556 subl(count, 16 << shift);
5557 jcc(Assembler::less, L_check_fill_32_bytes);
5558 align(16);
5559
5560 BIND(L_fill_64_bytes_loop);
5561 vmovdqu(Address(to, 0), xtmp);
5562 vmovdqu(Address(to, 32), xtmp);
5563 addptr(to, 64);
5564 subl(count, 16 << shift);
5565 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
5566
5567 BIND(L_check_fill_32_bytes);
5568 addl(count, 8 << shift);
5569 jccb(Assembler::less, L_check_fill_8_bytes);
5570 vmovdqu(Address(to, 0), xtmp);
5571 addptr(to, 32);
5572 subl(count, 8 << shift);
5573
5574 BIND(L_check_fill_8_bytes);
5575 // clean upper bits of YMM registers
5576 movdl(xtmp, value);
5577 pshufd(xtmp, xtmp, 0);
5578 } else {
5579 // Fill 32-byte chunks
5580 pshufd(xtmp, xtmp, 0);
5581
5582 subl(count, 8 << shift);
5583 jcc(Assembler::less, L_check_fill_8_bytes);
5584 align(16);
5585
5586 BIND(L_fill_32_bytes_loop);
5587
5588 if (UseUnalignedLoadStores) {
5589 movdqu(Address(to, 0), xtmp);
5590 movdqu(Address(to, 16), xtmp);
5591 } else {
5592 movq(Address(to, 0), xtmp);
5593 movq(Address(to, 8), xtmp);
5594 movq(Address(to, 16), xtmp);
5595 movq(Address(to, 24), xtmp);
5596 }
5597
5598 addptr(to, 32);
5599 subl(count, 8 << shift);
5600 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5601
5602 BIND(L_check_fill_8_bytes);
5603 }
5604 addl(count, 8 << shift);
5605 jccb(Assembler::zero, L_exit);
5606 jmpb(L_fill_8_bytes);
5607
5608 //
5609 // length is too short, just fill qwords
5610 //
5611 BIND(L_fill_8_bytes_loop);
5612 movq(Address(to, 0), xtmp);
5613 addptr(to, 8);
5614 BIND(L_fill_8_bytes);
5615 subl(count, 1 << (shift + 1));
5616 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5617 }
5618 }
5619 // fill trailing 4 bytes
5620 BIND(L_fill_4_bytes);
5621 testl(count, 1<<shift);
5622 jccb(Assembler::zero, L_fill_2_bytes);
5623 movl(Address(to, 0), value);
5624 if (t == T_BYTE || t == T_SHORT) {
5625 Label L_fill_byte;
5626 addptr(to, 4);
5627 BIND(L_fill_2_bytes);
5628 // fill trailing 2 bytes
5629 testl(count, 1<<(shift-1));
5630 jccb(Assembler::zero, L_fill_byte);
5631 movw(Address(to, 0), value);
5632 if (t == T_BYTE) {
5633 addptr(to, 2);
5634 BIND(L_fill_byte);
5635 // fill trailing byte
5636 testl(count, 1);
5637 jccb(Assembler::zero, L_exit);
5638 movb(Address(to, 0), value);
5639 } else {
5640 BIND(L_fill_byte);
5641 }
5642 } else {
5643 BIND(L_fill_2_bytes);
5644 }
5645 BIND(L_exit);
5646 }
5647
5648 // encode char[] to byte[] in ISO_8859_1 or ASCII
5649 //@IntrinsicCandidate
5650 //private static int implEncodeISOArray(byte[] sa, int sp,
5651 //byte[] da, int dp, int len) {
5652 // int i = 0;
5653 // for (; i < len; i++) {
5654 // char c = StringUTF16.getChar(sa, sp++);
5655 // if (c > '\u00FF')
5656 // break;
5657 // da[dp++] = (byte)c;
5658 // }
5659 // return i;
5660 //}
5661 //
5662 //@IntrinsicCandidate
5663 //private static int implEncodeAsciiArray(char[] sa, int sp,
5664 // byte[] da, int dp, int len) {
5665 // int i = 0;
5666 // for (; i < len; i++) {
5667 // char c = sa[sp++];
5668 // if (c >= '\u0080')
5669 // break;
5670 // da[dp++] = (byte)c;
5671 // }
5672 // return i;
5673 //}
5674 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
5675 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
5676 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
5677 Register tmp5, Register result, bool ascii) {
5678
5679 // rsi: src
5680 // rdi: dst
5681 // rdx: len
5682 // rcx: tmp5
5683 // rax: result
5684 ShortBranchVerifier sbv(this);
5685 assert_different_registers(src, dst, len, tmp5, result);
5686 Label L_done, L_copy_1_char, L_copy_1_char_exit;
5687
5688 int mask = ascii ? 0xff80ff80 : 0xff00ff00;
5689 int short_mask = ascii ? 0xff80 : 0xff00;
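  // Any char above the encodable range ('\u00FF' for ISO-8859-1, '\u007F' for
  // ASCII) sets a bit under 'mask', so the ptest/vptest of a vector of chars
  // against the broadcast mask detects chars that cannot be narrowed to one
  // byte; 'short_mask' plays the same role for the scalar tail loop.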
5690
5691 // set result
5692 xorl(result, result);
5693 // check for zero length
5694 testl(len, len);
5695 jcc(Assembler::zero, L_done);
5696
5697 movl(result, len);
5698
5699 // Setup pointers
5700 lea(src, Address(src, len, Address::times_2)); // char[]
5701 lea(dst, Address(dst, len, Address::times_1)); // byte[]
5702 negptr(len);
5703
5704 if (UseSSE42Intrinsics || UseAVX >= 2) {
5705 Label L_copy_8_chars, L_copy_8_chars_exit;
5706 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
5707
5708 if (UseAVX >= 2) {
5709 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
5710 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
5711 movdl(tmp1Reg, tmp5);
5712 vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
5713 jmp(L_chars_32_check);
5714
5715 bind(L_copy_32_chars);
5716 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
5717 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
5718 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5719 vptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector
5720 jccb(Assembler::notZero, L_copy_32_chars_exit);
5721 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5722 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
5723 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
5724
5725 bind(L_chars_32_check);
5726 addptr(len, 32);
5727 jcc(Assembler::lessEqual, L_copy_32_chars);
5728
5729 bind(L_copy_32_chars_exit);
5730 subptr(len, 16);
5731 jccb(Assembler::greater, L_copy_16_chars_exit);
5732
5733 } else if (UseSSE42Intrinsics) {
5734 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
5735 movdl(tmp1Reg, tmp5);
5736 pshufd(tmp1Reg, tmp1Reg, 0);
5737 jmpb(L_chars_16_check);
5738 }
5739
5740 bind(L_copy_16_chars);
5741 if (UseAVX >= 2) {
5742 vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
5743 vptest(tmp2Reg, tmp1Reg);
5744 jcc(Assembler::notZero, L_copy_16_chars_exit);
5745 vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
5746 vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
5747 } else {
5748 if (UseAVX > 0) {
5749 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5750 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5751 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
5752 } else {
5753 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5754 por(tmp2Reg, tmp3Reg);
5755 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5756 por(tmp2Reg, tmp4Reg);
5757 }
5758 ptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector
5759 jccb(Assembler::notZero, L_copy_16_chars_exit);
5760 packuswb(tmp3Reg, tmp4Reg);
5761 }
5762 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
5763
5764 bind(L_chars_16_check);
5765 addptr(len, 16);
5766 jcc(Assembler::lessEqual, L_copy_16_chars);
5767
5768 bind(L_copy_16_chars_exit);
5769 if (UseAVX >= 2) {
5770 // clean upper bits of YMM registers
5771 vpxor(tmp2Reg, tmp2Reg);
5772 vpxor(tmp3Reg, tmp3Reg);
5773 vpxor(tmp4Reg, tmp4Reg);
5774 movdl(tmp1Reg, tmp5);
5775 pshufd(tmp1Reg, tmp1Reg, 0);
5776 }
5777 subptr(len, 8);
5778 jccb(Assembler::greater, L_copy_8_chars_exit);
5779
5780 bind(L_copy_8_chars);
5781 movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
5782 ptest(tmp3Reg, tmp1Reg);
5783 jccb(Assembler::notZero, L_copy_8_chars_exit);
5784 packuswb(tmp3Reg, tmp1Reg);
5785 movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
5786 addptr(len, 8);
5787 jccb(Assembler::lessEqual, L_copy_8_chars);
5788
5789 bind(L_copy_8_chars_exit);
5790 subptr(len, 8);
5791 jccb(Assembler::zero, L_done);
5792 }
5793
5794 bind(L_copy_1_char);
5795 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
5796 testl(tmp5, short_mask); // check if Unicode or non-ASCII char
5797 jccb(Assembler::notZero, L_copy_1_char_exit);
5798 movb(Address(dst, len, Address::times_1, 0), tmp5);
5799 addptr(len, 1);
5800 jccb(Assembler::less, L_copy_1_char);
5801
5802 bind(L_copy_1_char_exit);
  addptr(result, len); // len is the negative count of unprocessed elements
5804
5805 bind(L_done);
5806 }
5807
5808 #ifdef _LP64
5809 /**
 * Helper for multiply_to_len(): computes the 128-bit sum
 * dest_hi:dest_lo += src1 + src2, propagating carries into dest_hi.
5811 */
5812 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
5813 addq(dest_lo, src1);
5814 adcq(dest_hi, 0);
5815 addq(dest_lo, src2);
5816 adcq(dest_hi, 0);
5817 }
5818
5819 /**
5820 * Multiply 64 bit by 64 bit first loop.
5821 */
5822 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5823 Register y, Register y_idx, Register z,
5824 Register carry, Register product,
5825 Register idx, Register kdx) {
5826 //
5827 // jlong carry, x[], y[], z[];
  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5829 // huge_128 product = y[idx] * x[xstart] + carry;
5830 // z[kdx] = (jlong)product;
5831 // carry = (jlong)(product >>> 64);
5832 // }
5833 // z[xstart] = carry;
5834 //
5835
5836 Label L_first_loop, L_first_loop_exit;
5837 Label L_one_x, L_one_y, L_multiply;
5838
5839 decrementl(xstart);
5840 jcc(Assembler::negative, L_one_x);
5841
5842 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
5843 rorq(x_xstart, 32); // convert big-endian to little-endian
5844
5845 bind(L_first_loop);
5846 decrementl(idx);
5847 jcc(Assembler::negative, L_first_loop_exit);
5848 decrementl(idx);
5849 jcc(Assembler::negative, L_one_y);
5850 movq(y_idx, Address(y, idx, Address::times_4, 0));
5851 rorq(y_idx, 32); // convert big-endian to little-endian
5852 bind(L_multiply);
5853 movq(product, x_xstart);
5854 mulq(y_idx); // product(rax) * y_idx -> rdx:rax
5855 addq(product, carry);
5856 adcq(rdx, 0);
5857 subl(kdx, 2);
5858 movl(Address(z, kdx, Address::times_4, 4), product);
5859 shrq(product, 32);
5860 movl(Address(z, kdx, Address::times_4, 0), product);
5861 movq(carry, rdx);
5862 jmp(L_first_loop);
5863
5864 bind(L_one_y);
5865 movl(y_idx, Address(y, 0));
5866 jmp(L_multiply);
5867
5868 bind(L_one_x);
5869 movl(x_xstart, Address(x, 0));
5870 jmp(L_first_loop);
5871
5872 bind(L_first_loop_exit);
5873 }
5874
5875 /**
5876 * Multiply 64 bit by 64 bit and add 128 bit.
5877 */
5878 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
5879 Register yz_idx, Register idx,
5880 Register carry, Register product, int offset) {
5881 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
5882 // z[kdx] = (jlong)product;
5883
5884 movq(yz_idx, Address(y, idx, Address::times_4, offset));
5885 rorq(yz_idx, 32); // convert big-endian to little-endian
5886 movq(product, x_xstart);
5887 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
5888 movq(yz_idx, Address(z, idx, Address::times_4, offset));
5889 rorq(yz_idx, 32); // convert big-endian to little-endian
5890
5891 add2_with_carry(rdx, product, carry, yz_idx);
5892
5893 movl(Address(z, idx, Address::times_4, offset+4), product);
5894 shrq(product, 32);
5895 movl(Address(z, idx, Address::times_4, offset), product);
5896
5897 }
5898
5899 /**
5900 * Multiply 128 bit by 128 bit. Unrolled inner loop.
5901 */
5902 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
5903 Register yz_idx, Register idx, Register jdx,
5904 Register carry, Register product,
5905 Register carry2) {
5906 // jlong carry, x[], y[], z[];
5907 // int kdx = ystart+1;
5908 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5909 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
5910 // z[kdx+idx+1] = (jlong)product;
5911 // jlong carry2 = (jlong)(product >>> 64);
5912 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
5913 // z[kdx+idx] = (jlong)product;
5914 // carry = (jlong)(product >>> 64);
5915 // }
5916 // idx += 2;
5917 // if (idx > 0) {
5918 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
5919 // z[kdx+idx] = (jlong)product;
5920 // carry = (jlong)(product >>> 64);
5921 // }
5922 //
5923
5924 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5925
5926 movl(jdx, idx);
5927 andl(jdx, 0xFFFFFFFC);
5928 shrl(jdx, 2);
5929
5930 bind(L_third_loop);
5931 subl(jdx, 1);
5932 jcc(Assembler::negative, L_third_loop_exit);
5933 subl(idx, 4);
5934
5935 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
5936 movq(carry2, rdx);
5937
5938 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
5939 movq(carry, rdx);
5940 jmp(L_third_loop);
5941
5942 bind (L_third_loop_exit);
5943
5944 andl (idx, 0x3);
5945 jcc(Assembler::zero, L_post_third_loop_done);
5946
5947 Label L_check_1;
5948 subl(idx, 2);
5949 jcc(Assembler::negative, L_check_1);
5950
5951 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
5952 movq(carry, rdx);
5953
5954 bind (L_check_1);
5955 addl (idx, 0x2);
5956 andl (idx, 0x1);
5957 subl(idx, 1);
5958 jcc(Assembler::negative, L_post_third_loop_done);
5959
5960 movl(yz_idx, Address(y, idx, Address::times_4, 0));
5961 movq(product, x_xstart);
5962 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
5963 movl(yz_idx, Address(z, idx, Address::times_4, 0));
5964
5965 add2_with_carry(rdx, product, yz_idx, carry);
5966
5967 movl(Address(z, idx, Address::times_4, 0), product);
5968 shrq(product, 32);
5969
5970 shlq(rdx, 32);
5971 orq(product, rdx);
5972 movq(carry, product);
5973
5974 bind(L_post_third_loop_done);
5975 }
5976
5977 /**
5978 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
5979 *
5980 */
5981 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
5982 Register carry, Register carry2,
5983 Register idx, Register jdx,
5984 Register yz_idx1, Register yz_idx2,
5985 Register tmp, Register tmp3, Register tmp4) {
5986 assert(UseBMI2Instructions, "should be used only when BMI2 is available");
5987
5988 // jlong carry, x[], y[], z[];
5989 // int kdx = ystart+1;
5990 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5991 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
5992 // jlong carry2 = (jlong)(tmp3 >>> 64);
5993 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
5994 // carry = (jlong)(tmp4 >>> 64);
5995 // z[kdx+idx+1] = (jlong)tmp3;
5996 // z[kdx+idx] = (jlong)tmp4;
5997 // }
5998 // idx += 2;
5999 // if (idx > 0) {
6000 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
6001 // z[kdx+idx] = (jlong)yz_idx1;
6002 // carry = (jlong)(yz_idx1 >>> 64);
6003 // }
6004 //
6005
6006 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6007
6008 movl(jdx, idx);
6009 andl(jdx, 0xFFFFFFFC);
6010 shrl(jdx, 2);
6011
6012 bind(L_third_loop);
6013 subl(jdx, 1);
6014 jcc(Assembler::negative, L_third_loop_exit);
6015 subl(idx, 4);
6016
6017 movq(yz_idx1, Address(y, idx, Address::times_4, 8));
6018 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
6019 movq(yz_idx2, Address(y, idx, Address::times_4, 0));
6020 rorxq(yz_idx2, yz_idx2, 32);
6021
6022 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
6023 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp
6024
6025 movq(yz_idx1, Address(z, idx, Address::times_4, 8));
6026 rorxq(yz_idx1, yz_idx1, 32);
6027 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
6028 rorxq(yz_idx2, yz_idx2, 32);
6029
6030 if (VM_Version::supports_adx()) {
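    // adcx and adox maintain two independent carry chains (CF and OF
    // respectively), so the two 128-bit accumulations below can overlap
    // instead of serializing on a single carry flag.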
6031 adcxq(tmp3, carry);
6032 adoxq(tmp3, yz_idx1);
6033
6034 adcxq(tmp4, tmp);
6035 adoxq(tmp4, yz_idx2);
6036
6037 movl(carry, 0); // does not affect flags
6038 adcxq(carry2, carry);
6039 adoxq(carry2, carry);
6040 } else {
6041 add2_with_carry(tmp4, tmp3, carry, yz_idx1);
6042 add2_with_carry(carry2, tmp4, tmp, yz_idx2);
6043 }
6044 movq(carry, carry2);
6045
6046 movl(Address(z, idx, Address::times_4, 12), tmp3);
6047 shrq(tmp3, 32);
6048 movl(Address(z, idx, Address::times_4, 8), tmp3);
6049
6050 movl(Address(z, idx, Address::times_4, 4), tmp4);
6051 shrq(tmp4, 32);
6052 movl(Address(z, idx, Address::times_4, 0), tmp4);
6053
6054 jmp(L_third_loop);
6055
6056 bind (L_third_loop_exit);
6057
6058 andl (idx, 0x3);
6059 jcc(Assembler::zero, L_post_third_loop_done);
6060
6061 Label L_check_1;
6062 subl(idx, 2);
6063 jcc(Assembler::negative, L_check_1);
6064
6065 movq(yz_idx1, Address(y, idx, Address::times_4, 0));
6066 rorxq(yz_idx1, yz_idx1, 32);
6067 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
6068 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
6069 rorxq(yz_idx2, yz_idx2, 32);
6070
6071 add2_with_carry(tmp4, tmp3, carry, yz_idx2);
6072
6073 movl(Address(z, idx, Address::times_4, 4), tmp3);
6074 shrq(tmp3, 32);
6075 movl(Address(z, idx, Address::times_4, 0), tmp3);
6076 movq(carry, tmp4);
6077
6078 bind (L_check_1);
6079 addl (idx, 0x2);
6080 andl (idx, 0x1);
6081 subl(idx, 1);
6082 jcc(Assembler::negative, L_post_third_loop_done);
6083 movl(tmp4, Address(y, idx, Address::times_4, 0));
6084 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
6085 movl(tmp4, Address(z, idx, Address::times_4, 0));
6086
6087 add2_with_carry(carry2, tmp3, tmp4, carry);
6088
6089 movl(Address(z, idx, Address::times_4, 0), tmp3);
6090 shrq(tmp3, 32);
6091
6092 shlq(carry2, 32);
6093 orq(tmp3, carry2);
6094 movq(carry, tmp3);
6095
6096 bind(L_post_third_loop_done);
6097 }
6098
6099 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
6101 *
6102 * rdi: x
6103 * rax: xlen
6104 * rsi: y
6105 * rcx: ylen
6106 * r8: z
6107 * r11: zlen
6108 * r12: tmp1
6109 * r13: tmp2
6110 * r14: tmp3
6111 * r15: tmp4
6112 * rbx: tmp5
6113 *
6114 */
6115 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
6116 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
6117 ShortBranchVerifier sbv(this);
6118 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
6119
6120 push(tmp1);
6121 push(tmp2);
6122 push(tmp3);
6123 push(tmp4);
6124 push(tmp5);
6125
6126 push(xlen);
6127 push(zlen);
6128
6129 const Register idx = tmp1;
6130 const Register kdx = tmp2;
6131 const Register xstart = tmp3;
6132
6133 const Register y_idx = tmp4;
6134 const Register carry = tmp5;
6135 const Register product = xlen;
6136 const Register x_xstart = zlen; // reuse register
6137
6138 // First Loop.
6139 //
6140 // final static long LONG_MASK = 0xffffffffL;
6141 // int xstart = xlen - 1;
6142 // int ystart = ylen - 1;
6143 // long carry = 0;
  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
6145 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
6146 // z[kdx] = (int)product;
6147 // carry = product >>> 32;
6148 // }
6149 // z[xstart] = (int)carry;
6150 //
6151
6152 movl(idx, ylen); // idx = ylen;
6153 movl(kdx, zlen); // kdx = xlen+ylen;
6154 xorq(carry, carry); // carry = 0;
6155
6156 Label L_done;
6157
6158 movl(xstart, xlen);
6159 decrementl(xstart);
6160 jcc(Assembler::negative, L_done);
6161
6162 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
6163
6164 Label L_second_loop;
6165 testl(kdx, kdx);
6166 jcc(Assembler::zero, L_second_loop);
6167
6168 Label L_carry;
6169 subl(kdx, 1);
6170 jcc(Assembler::zero, L_carry);
6171
6172 movl(Address(z, kdx, Address::times_4, 0), carry);
6173 shrq(carry, 32);
6174 subl(kdx, 1);
6175
6176 bind(L_carry);
6177 movl(Address(z, kdx, Address::times_4, 0), carry);
6178
6179 // Second and third (nested) loops.
6180 //
6181 // for (int i = xstart-1; i >= 0; i--) { // Second loop
6182 // carry = 0;
6183 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
6184 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
6185 // (z[k] & LONG_MASK) + carry;
6186 // z[k] = (int)product;
6187 // carry = product >>> 32;
6188 // }
6189 // z[i] = (int)carry;
6190 // }
6191 //
6192 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
6193
6194 const Register jdx = tmp1;
6195
6196 bind(L_second_loop);
6197 xorl(carry, carry); // carry = 0;
6198 movl(jdx, ylen); // j = ystart+1
6199
6200 subl(xstart, 1); // i = xstart-1;
6201 jcc(Assembler::negative, L_done);
6202
6203 push (z);
6204
6205 Label L_last_x;
6206 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
6207 subl(xstart, 1); // i = xstart-1;
6208 jcc(Assembler::negative, L_last_x);
6209
6210 if (UseBMI2Instructions) {
6211 movq(rdx, Address(x, xstart, Address::times_4, 0));
6212 rorxq(rdx, rdx, 32); // convert big-endian to little-endian
6213 } else {
6214 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
6215 rorq(x_xstart, 32); // convert big-endian to little-endian
6216 }
6217
6218 Label L_third_loop_prologue;
6219 bind(L_third_loop_prologue);
6220
6221 push (x);
6222 push (xstart);
6223 push (ylen);
6224
6225
6226 if (UseBMI2Instructions) {
6227 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
6228 } else { // !UseBMI2Instructions
6229 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
6230 }
6231
6232 pop(ylen);
6233 pop(xlen);
6234 pop(x);
6235 pop(z);
6236
6237 movl(tmp3, xlen);
6238 addl(tmp3, 1);
6239 movl(Address(z, tmp3, Address::times_4, 0), carry);
6240 subl(tmp3, 1);
6241 jccb(Assembler::negative, L_done);
6242
6243 shrq(carry, 32);
6244 movl(Address(z, tmp3, Address::times_4, 0), carry);
6245 jmp(L_second_loop);
6246
6247 // Next infrequent code is moved outside loops.
6248 bind(L_last_x);
6249 if (UseBMI2Instructions) {
6250 movl(rdx, Address(x, 0));
6251 } else {
6252 movl(x_xstart, Address(x, 0));
6253 }
6254 jmp(L_third_loop_prologue);
6255
6256 bind(L_done);
6257
6258 pop(zlen);
6259 pop(xlen);
6260
6261 pop(tmp5);
6262 pop(tmp4);
6263 pop(tmp3);
6264 pop(tmp2);
6265 pop(tmp1);
6266 }
6267
6268 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
6269 Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
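  // Contract (a sketch, inferred from the code below): 'length' arrives as an
  // element count and is converted to bytes by shlq(length), shifting by the
  // element scale held in rcx (log2_array_indxscale). On a mismatch, 'result'
  // holds the byte offset converted back to an element index via shrq(result);
  // if the ranges are equal up to 'length', 'result' is set to -1.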
6270 assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
6271 Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
6272 Label VECTOR8_TAIL, VECTOR4_TAIL;
6273 Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
6274 Label SAME_TILL_END, DONE;
6275 Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
6276
6277 //scale is in rcx in both Win64 and Unix
6278 ShortBranchVerifier sbv(this);
6279
6280 shlq(length);
6281 xorq(result, result);
6282
6283 if ((AVX3Threshold == 0) && (UseAVX > 2) &&
6284 VM_Version::supports_avx512vlbw()) {
6285 Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
6286
6287 cmpq(length, 64);
6288 jcc(Assembler::less, VECTOR32_TAIL);
6289
6290 movq(tmp1, length);
6291 andq(tmp1, 0x3F); // tail count
6292 andq(length, ~(0x3F)); //vector count
6293
6294 bind(VECTOR64_LOOP);
6295 // AVX512 code to compare 64 byte vectors.
6296 evmovdqub(rymm0, Address(obja, result), false, Assembler::AVX_512bit);
6297 evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
6298 kortestql(k7, k7);
6299 jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch
6300 addq(result, 64);
6301 subq(length, 64);
6302 jccb(Assembler::notZero, VECTOR64_LOOP);
6303
6305 testq(tmp1, tmp1);
6306 jcc(Assembler::zero, SAME_TILL_END);
6307
6308 //bind(VECTOR64_TAIL);
    // AVX512 code to compare up to 63 byte vectors.
6310 mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
6311 shlxq(tmp2, tmp2, tmp1);
6312 notq(tmp2);
6313 kmovql(k3, tmp2);
6314
6315 evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
6316 evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
6317
6318 ktestql(k7, k3);
6319 jcc(Assembler::below, SAME_TILL_END); // not mismatch
6320
6321 bind(VECTOR64_NOT_EQUAL);
6322 kmovql(tmp1, k7);
6323 notq(tmp1);
6324 tzcntq(tmp1, tmp1);
6325 addq(result, tmp1);
6326 shrq(result);
6327 jmp(DONE);
6328 bind(VECTOR32_TAIL);
6329 }
6330
6331 cmpq(length, 8);
6332 jcc(Assembler::equal, VECTOR8_LOOP);
6333 jcc(Assembler::less, VECTOR4_TAIL);
6334
6335 if (UseAVX >= 2) {
6336 Label VECTOR16_TAIL, VECTOR32_LOOP;
6337
6338 cmpq(length, 16);
6339 jcc(Assembler::equal, VECTOR16_LOOP);
6340 jcc(Assembler::less, VECTOR8_LOOP);
6341
6342 cmpq(length, 32);
6343 jccb(Assembler::less, VECTOR16_TAIL);
6344
6345 subq(length, 32);
6346 bind(VECTOR32_LOOP);
6347 vmovdqu(rymm0, Address(obja, result));
6348 vmovdqu(rymm1, Address(objb, result));
6349 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
6350 vptest(rymm2, rymm2);
6351 jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
6352 addq(result, 32);
6353 subq(length, 32);
6354 jcc(Assembler::greaterEqual, VECTOR32_LOOP);
6355 addq(length, 32);
6356 jcc(Assembler::equal, SAME_TILL_END);
    // falling through if less than 32 bytes left; close the branch here.
6358
6359 bind(VECTOR16_TAIL);
6360 cmpq(length, 16);
6361 jccb(Assembler::less, VECTOR8_TAIL);
6362 bind(VECTOR16_LOOP);
6363 movdqu(rymm0, Address(obja, result));
6364 movdqu(rymm1, Address(objb, result));
6365 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
6366 ptest(rymm2, rymm2);
6367 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6368 addq(result, 16);
6369 subq(length, 16);
6370 jcc(Assembler::equal, SAME_TILL_END);
6371 //falling through if less than 16 bytes left
6372 } else {//regular intrinsics
6373
6374 cmpq(length, 16);
6375 jccb(Assembler::less, VECTOR8_TAIL);
6376
6377 subq(length, 16);
6378 bind(VECTOR16_LOOP);
6379 movdqu(rymm0, Address(obja, result));
6380 movdqu(rymm1, Address(objb, result));
6381 pxor(rymm0, rymm1);
6382 ptest(rymm0, rymm0);
6383 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6384 addq(result, 16);
6385 subq(length, 16);
6386 jccb(Assembler::greaterEqual, VECTOR16_LOOP);
6387 addq(length, 16);
6388 jcc(Assembler::equal, SAME_TILL_END);
6389 //falling through if less than 16 bytes left
6390 }
6391
6392 bind(VECTOR8_TAIL);
6393 cmpq(length, 8);
6394 jccb(Assembler::less, VECTOR4_TAIL);
6395 bind(VECTOR8_LOOP);
6396 movq(tmp1, Address(obja, result));
6397 movq(tmp2, Address(objb, result));
6398 xorq(tmp1, tmp2);
6399 testq(tmp1, tmp1);
6400 jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
6401 addq(result, 8);
6402 subq(length, 8);
6403 jcc(Assembler::equal, SAME_TILL_END);
6404 //falling through if less than 8 bytes left
6405
6406 bind(VECTOR4_TAIL);
6407 cmpq(length, 4);
6408 jccb(Assembler::less, BYTES_TAIL);
6409 bind(VECTOR4_LOOP);
6410 movl(tmp1, Address(obja, result));
6411 xorl(tmp1, Address(objb, result));
6412 testl(tmp1, tmp1);
6413 jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
6414 addq(result, 4);
6415 subq(length, 4);
6416 jcc(Assembler::equal, SAME_TILL_END);
6417 //falling through if less than 4 bytes left
6418
6419 bind(BYTES_TAIL);
6420 bind(BYTES_LOOP);
6421 load_unsigned_byte(tmp1, Address(obja, result));
6422 load_unsigned_byte(tmp2, Address(objb, result));
6423 xorl(tmp1, tmp2);
6424 testl(tmp1, tmp1);
6425 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6426 decq(length);
6427 jcc(Assembler::zero, SAME_TILL_END);
6428 incq(result);
6429 load_unsigned_byte(tmp1, Address(obja, result));
6430 load_unsigned_byte(tmp2, Address(objb, result));
6431 xorl(tmp1, tmp2);
6432 testl(tmp1, tmp1);
6433 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6434 decq(length);
6435 jcc(Assembler::zero, SAME_TILL_END);
6436 incq(result);
6437 load_unsigned_byte(tmp1, Address(obja, result));
6438 load_unsigned_byte(tmp2, Address(objb, result));
6439 xorl(tmp1, tmp2);
6440 testl(tmp1, tmp1);
6441 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6442 jmp(SAME_TILL_END);
6443
6444 if (UseAVX >= 2) {
6445 bind(VECTOR32_NOT_EQUAL);
6446 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
6447 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
6448 vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
6449 vpmovmskb(tmp1, rymm0);
6450 bsfq(tmp1, tmp1);
6451 addq(result, tmp1);
6452 shrq(result);
6453 jmp(DONE);
6454 }
6455
6456 bind(VECTOR16_NOT_EQUAL);
6457 if (UseAVX >= 2) {
6458 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
6459 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
6460 pxor(rymm0, rymm2);
6461 } else {
6462 pcmpeqb(rymm2, rymm2);
6463 pxor(rymm0, rymm1);
6464 pcmpeqb(rymm0, rymm1);
6465 pxor(rymm0, rymm2);
6466 }
6467 pmovmskb(tmp1, rymm0);
6468 bsfq(tmp1, tmp1);
6469 addq(result, tmp1);
6470 shrq(result);
6471 jmpb(DONE);
6472
6473 bind(VECTOR8_NOT_EQUAL);
6474 bind(VECTOR4_NOT_EQUAL);
6475 bsfq(tmp1, tmp1);
6476 shrq(tmp1, 3);
6477 addq(result, tmp1);
6478 bind(BYTES_NOT_EQUAL);
6479 shrq(result);
6480 jmpb(DONE);
6481
6482 bind(SAME_TILL_END);
6483 mov64(result, -1);
6484
6485 bind(DONE);
6486 }
6487
6488 //Helper functions for square_to_len()
6489
6490 /**
6491 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
6492 * Preserves x and z and modifies rest of the registers.
6493 */
6494 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6495 // Perform square and right shift by 1
6496 // Handle odd xlen case first, then for even xlen do the following
6497 // jlong carry = 0;
6498 // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
6499 // huge_128 product = x[j:j+1] * x[j:j+1];
6500 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
6501 // z[i+2:i+3] = (jlong)(product >>> 1);
6502 // carry = (jlong)product;
6503 // }
6504
6505 xorq(tmp5, tmp5); // carry
6506 xorq(rdxReg, rdxReg);
6507 xorl(tmp1, tmp1); // index for x
6508 xorl(tmp4, tmp4); // index for z
6509
6510 Label L_first_loop, L_first_loop_exit;
6511
6512 testl(xlen, 1);
6513 jccb(Assembler::zero, L_first_loop); //jump if xlen is even
6514
6515 // Square and right shift by 1 the odd element using 32 bit multiply
6516 movl(raxReg, Address(x, tmp1, Address::times_4, 0));
6517 imulq(raxReg, raxReg);
6518 shrq(raxReg, 1);
6519 adcq(tmp5, 0);
6520 movq(Address(z, tmp4, Address::times_4, 0), raxReg);
6521 incrementl(tmp1);
6522 addl(tmp4, 2);
6523
6524 // Square and right shift by 1 the rest using 64 bit multiply
6525 bind(L_first_loop);
6526 cmpptr(tmp1, xlen);
6527 jccb(Assembler::equal, L_first_loop_exit);
6528
6529 // Square
6530 movq(raxReg, Address(x, tmp1, Address::times_4, 0));
6531 rorq(raxReg, 32); // convert big-endian to little-endian
6532 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax
6533
6534 // Right shift by 1 and save carry
6535 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
6536 rcrq(rdxReg, 1);
6537 rcrq(raxReg, 1);
6538 adcq(tmp5, 0);
6539
6540 // Store result in z
6541 movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
6542 movq(Address(z, tmp4, Address::times_4, 8), raxReg);
6543
6544 // Update indices for x and z
6545 addl(tmp1, 2);
6546 addl(tmp4, 4);
6547 jmp(L_first_loop);
6548
6549 bind(L_first_loop_exit);
6550 }
6551
6552
6553 /**
6554 * Perform the following multiply add operation using BMI2 instructions
6555 * carry:sum = sum + op1*op2 + carry
6556 * op2 should be in rdx
6557 * op2 is preserved, all other registers are modified
6558 */
6559 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
6560 // assert op2 is rdx
6561 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1
6562 addq(sum, carry);
6563 adcq(tmp2, 0);
6564 addq(sum, op1);
6565 adcq(tmp2, 0);
6566 movq(carry, tmp2);
6567 }
6568
6569 /**
6570 * Perform the following multiply add operation:
6571 * carry:sum = sum + op1*op2 + carry
6572 * Preserves op1, op2 and modifies rest of registers
6573 */
6574 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
6575 // rdx:rax = op1 * op2
6576 movq(raxReg, op2);
6577 mulq(op1);
6578
6579 // rdx:rax = sum + carry + rdx:rax
6580 addq(sum, carry);
6581 adcq(rdxReg, 0);
6582 addq(sum, raxReg);
6583 adcq(rdxReg, 0);
6584
6585 // carry:sum = rdx:sum
6586 movq(carry, rdxReg);
6587 }
6588
6589 /**
 * Add 64 bit long carry into z[] with carry propagation.
6591 * Preserves z and carry register values and modifies rest of registers.
6592 *
6593 */
6594 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
6595 Label L_fourth_loop, L_fourth_loop_exit;
6596
6597 movl(tmp1, 1);
6598 subl(zlen, 2);
6599 addq(Address(z, zlen, Address::times_4, 0), carry);
6600
6601 bind(L_fourth_loop);
6602 jccb(Assembler::carryClear, L_fourth_loop_exit);
6603 subl(zlen, 2);
6604 jccb(Assembler::negative, L_fourth_loop_exit);
6605 addq(Address(z, zlen, Address::times_4, 0), tmp1);
6606 jmp(L_fourth_loop);
6607 bind(L_fourth_loop_exit);
6608 }
6609
6610 /**
6611 * Shift z[] left by 1 bit.
6612 * Preserves x, len, z and zlen registers and modifies rest of the registers.
6613 *
6614 */
6615 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
6616
6617 Label L_fifth_loop, L_fifth_loop_exit;
6618
6619 // Fifth loop
6620 // Perform primitiveLeftShift(z, zlen, 1)
6621
6622 const Register prev_carry = tmp1;
6623 const Register new_carry = tmp4;
6624 const Register value = tmp2;
6625 const Register zidx = tmp3;
6626
6627 // int zidx, carry;
6628 // long value;
6629 // carry = 0;
6630 // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
  //    (carry:value) = (z[zidx] << 1) | carry;
  //    z[zidx] = value;
6633 // }
6634
6635 movl(zidx, zlen);
6636 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
6637
6638 bind(L_fifth_loop);
6639 decl(zidx); // Use decl to preserve carry flag
6640 decl(zidx);
6641 jccb(Assembler::negative, L_fifth_loop_exit);
6642
6643 if (UseBMI2Instructions) {
6644 movq(value, Address(z, zidx, Address::times_4, 0));
6645 rclq(value, 1);
6646 rorxq(value, value, 32);
6647 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
6648 }
6649 else {
6650 // clear new_carry
6651 xorl(new_carry, new_carry);
6652
6653 // Shift z[i] by 1, or in previous carry and save new carry
6654 movq(value, Address(z, zidx, Address::times_4, 0));
6655 shlq(value, 1);
6656 adcl(new_carry, 0);
6657
6658 orq(value, prev_carry);
6659 rorq(value, 0x20);
6660 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
6661
6662 // Set previous carry = new carry
6663 movl(prev_carry, new_carry);
6664 }
6665 jmp(L_fifth_loop);
6666
6667 bind(L_fifth_loop_exit);
6668 }
6669
6670
6671 /**
6672 * Code for BigInteger::squareToLen() intrinsic
6673 *
6674 * rdi: x
6675 * rsi: len
6676 * r8: z
6677 * rcx: zlen
6678 * r12: tmp1
6679 * r13: tmp2
6680 * r14: tmp3
6681 * r15: tmp4
6682 * rbx: tmp5
6683 *
6684 */
6685 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6686
6687 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
6688 push(tmp1);
6689 push(tmp2);
6690 push(tmp3);
6691 push(tmp4);
6692 push(tmp5);
6693
6694 // First loop
6695 // Store the squares, right shifted one bit (i.e., divided by 2).
6696 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
6697
6698 // Add in off-diagonal sums.
6699 //
6700 // Second, third (nested) and fourth loops.
6701 // zlen +=2;
6702 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
6703 // carry = 0;
6704 // long op2 = x[xidx:xidx+1];
6705 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
6706 // k -= 2;
6707 // long op1 = x[j:j+1];
6708 // long sum = z[k:k+1];
6709 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
6710 // z[k:k+1] = sum;
6711 // }
6712 // add_one_64(z, k, carry, tmp_regs);
6713 // }
6714
6715 const Register carry = tmp5;
6716 const Register sum = tmp3;
6717 const Register op1 = tmp4;
6718 Register op2 = tmp2;
6719
6720 push(zlen);
6721 push(len);
6722 addl(zlen,2);
6723 bind(L_second_loop);
6724 xorq(carry, carry);
6725 subl(zlen, 4);
6726 subl(len, 2);
6727 push(zlen);
6728 push(len);
6729 cmpl(len, 0);
6730 jccb(Assembler::lessEqual, L_second_loop_exit);
6731
6732 // Multiply an array by one 64 bit long.
6733 if (UseBMI2Instructions) {
6734 op2 = rdxReg;
6735 movq(op2, Address(x, len, Address::times_4, 0));
6736 rorxq(op2, op2, 32);
6737 }
6738 else {
6739 movq(op2, Address(x, len, Address::times_4, 0));
6740 rorq(op2, 32);
6741 }
6742
6743 bind(L_third_loop);
6744 decrementl(len);
6745 jccb(Assembler::negative, L_third_loop_exit);
6746 decrementl(len);
6747 jccb(Assembler::negative, L_last_x);
6748
6749 movq(op1, Address(x, len, Address::times_4, 0));
6750 rorq(op1, 32);
6751
6752 bind(L_multiply);
6753 subl(zlen, 2);
6754 movq(sum, Address(z, zlen, Address::times_4, 0));
6755
6756 // Multiply 64 bit by 64 bit; add the lower 64 bits of the product into sum and keep the upper 64 bits as the carry.
6757 if (UseBMI2Instructions) {
6758 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
6759 }
6760 else {
6761 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6762 }
6763
6764 movq(Address(z, zlen, Address::times_4, 0), sum);
6765
6766 jmp(L_third_loop);
6767 bind(L_third_loop_exit);
6768
6769 // Fourth loop
6770 // Add 64 bit long carry into z with carry propagation.
6771 // Uses the adjusted (offset) zlen.
6772 add_one_64(z, zlen, carry, tmp1);
6773
6774 pop(len);
6775 pop(zlen);
6776 jmp(L_second_loop);
6777
6778 // The following infrequently executed code is moved outside the loops.
6779 bind(L_last_x);
6780 movl(op1, Address(x, 0));
6781 jmp(L_multiply);
6782
6783 bind(L_second_loop_exit);
6784 pop(len);
6785 pop(zlen);
6786 pop(len);
6787 pop(zlen);
6788
6789 // Fifth loop
6790 // Shift z left 1 bit.
6791 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
6792
6793 // z[zlen-1] |= x[len-1] & 1;
6794 movl(tmp3, Address(x, len, Address::times_4, -4));
6795 andl(tmp3, 1);
6796 orl(Address(z, zlen, Address::times_4, -4), tmp3);
6797
6798 pop(tmp5);
6799 pop(tmp4);
6800 pop(tmp3);
6801 pop(tmp2);
6802 pop(tmp1);
6803 }
6804
6805 /**
6806 * Helper function for mul_add()
6807 * Multiply the in[] by int k and add to out[] starting at offset offs using
6808 * 128 bit by 32 bit multiply and return the carry in tmp5.
6809 * Only the quad-int-aligned portion of in[] is processed by this function.
6810 * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
6811 * This function preserves the out, in and k registers.
6812 * len and offset point to the appropriate index in "in" and "out" respectively.
6813 * tmp5 holds the carry.
6814 * The other registers are temporaries and are modified.
6815 *
6816 */
6817 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
6818 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
6819 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6820
6821 Label L_first_loop, L_first_loop_exit;
6822
6823 movl(tmp1, len);
6824 shrl(tmp1, 2);
6825
6826 bind(L_first_loop);
6827 subl(tmp1, 1);
6828 jccb(Assembler::negative, L_first_loop_exit);
6829
6830 subl(len, 4);
6831 subl(offset, 4);
6832
6833 Register op2 = tmp2;
6834 const Register sum = tmp3;
6835 const Register op1 = tmp4;
6836 const Register carry = tmp5;
6837
6838 if (UseBMI2Instructions) {
6839 op2 = rdxReg;
6840 }
6841
6842 movq(op1, Address(in, len, Address::times_4, 8));
6843 rorq(op1, 32);
6844 movq(sum, Address(out, offset, Address::times_4, 8));
6845 rorq(sum, 32);
6846 if (UseBMI2Instructions) {
6847 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6848 }
6849 else {
6850 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6851 }
6852 // Store back in big endian from little endian
6853 rorq(sum, 0x20);
6854 movq(Address(out, offset, Address::times_4, 8), sum);
6855
6856 movq(op1, Address(in, len, Address::times_4, 0));
6857 rorq(op1, 32);
6858 movq(sum, Address(out, offset, Address::times_4, 0));
6859 rorq(sum, 32);
6860 if (UseBMI2Instructions) {
6861 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6862 }
6863 else {
6864 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6865 }
6866 // Store back in big endian from little endian
6867 rorq(sum, 0x20);
6868 movq(Address(out, offset, Address::times_4, 0), sum);
6869
6870 jmp(L_first_loop);
6871 bind(L_first_loop_exit);
6872 }
6873
6874 /**
6875 * Code for BigInteger::mulAdd() intrinsic
6876 *
6877 * rdi: out
6878 * rsi: in
6879 * r11: offs (out.length - offset)
6880 * rcx: len
6881 * r8: k
6882 * r12: tmp1
6883 * r13: tmp2
6884 * r14: tmp3
6885 * r15: tmp4
6886 * rbx: tmp5
6887 * Multiply the in[] by word k and add to out[], return the carry in rax
6888 */
6889 void MacroAssembler::mul_add(Register out, Register in, Register offs,
6890 Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
6891 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6892
6893 Label L_carry, L_last_in, L_done;
6894
6895 // carry = 0;
6896 // for (int j=len-1; j >= 0; j--) {
6897 // long product = (in[j] & LONG_MASK) * kLong +
6898 // (out[offs] & LONG_MASK) + carry;
6899 // out[offs--] = (int)product;
6900 // carry = product >>> 32;
6901 // }
6902 //
6903 push(tmp1);
6904 push(tmp2);
6905 push(tmp3);
6906 push(tmp4);
6907 push(tmp5);
6908
6909 Register op2 = tmp2;
6910 const Register sum = tmp3;
6911 const Register op1 = tmp4;
6912 const Register carry = tmp5;
6913
6914 if (UseBMI2Instructions) {
6915 op2 = rdxReg;
6916 movl(op2, k);
6917 }
6918 else {
6919 movl(op2, k);
6920 }
6921
6922 xorq(carry, carry);
6923
6924 //First loop
6925
6926 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
6927 //The carry is in tmp5
6928 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
6929
6930 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
6931 decrementl(len);
6932 jccb(Assembler::negative, L_carry);
6933 decrementl(len);
6934 jccb(Assembler::negative, L_last_in);
6935
6936 movq(op1, Address(in, len, Address::times_4, 0));
6937 rorq(op1, 32);
6938
6939 subl(offs, 2);
6940 movq(sum, Address(out, offs, Address::times_4, 0));
6941 rorq(sum, 32);
6942
6943 if (UseBMI2Instructions) {
6944 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6945 }
6946 else {
6947 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6948 }
6949
6950 // Store back in big endian from little endian
6951 rorq(sum, 0x20);
6952 movq(Address(out, offs, Address::times_4, 0), sum);
6953
6954 testl(len, len);
6955 jccb(Assembler::zero, L_carry);
6956
6957 //Multiply the last in[] entry, if any
6958 bind(L_last_in);
6959 movl(op1, Address(in, 0));
6960 movl(sum, Address(out, offs, Address::times_4, -4));
6961
6962 movl(raxReg, k);
6963 mull(op1); //tmp4 * eax -> edx:eax
6964 addl(sum, carry);
6965 adcl(rdxReg, 0);
6966 addl(sum, raxReg);
6967 adcl(rdxReg, 0);
6968 movl(carry, rdxReg);
6969
6970 movl(Address(out, offs, Address::times_4, -4), sum);
6971
6972 bind(L_carry);
6973 //return tmp5/carry as carry in rax
6974 movl(rax, carry);
6975
6976 bind(L_done);
6977 pop(tmp5);
6978 pop(tmp4);
6979 pop(tmp3);
6980 pop(tmp2);
6981 pop(tmp1);
6982 }
6983 #endif
6984
6985 /**
6986 * Emits code to update CRC-32 with a byte value according to constants in table
6987 *
6988 * @param [in,out]crc Register containing the crc.
6989 * @param [in]val Register containing the byte to fold into the CRC.
6990 * @param [in]table Register containing the table of crc constants.
6991 *
6992 * uint32_t crc;
6993 * val = crc_table[(val ^ crc) & 0xFF];
6994 * crc = val ^ (crc >> 8);
6995 *
6996 */
6997 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
6998 xorl(val, crc);
6999 andl(val, 0xFF);
7000 shrl(crc, 8); // unsigned shift
7001 xorl(crc, Address(table, val, Address::times_4, 0));
7002 }
7003
7004 /**
7005 * Fold 128-bit data chunk
7006 */
7007 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7008 if (UseAVX > 0) {
7009 vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7010 vpclmulldq(xcrc, xK, xcrc); // [63:0]
7011 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7012 pxor(xcrc, xtmp);
7013 } else {
7014 movdqa(xtmp, xcrc);
7015 pclmulhdq(xtmp, xK); // [123:64]
7016 pclmulldq(xcrc, xK); // [63:0]
7017 pxor(xcrc, xtmp);
7018 movdqu(xtmp, Address(buf, offset));
7019 pxor(xcrc, xtmp);
7020 }
7021 }
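// Conceptual sketch of one fold step (not the emitted code): with the two 64-bit fold
// constants held in xK, the running 128-bit remainder xcrc is replaced by
//
//   xcrc' = clmul(hi64(xcrc), hi64(xK)) ^ clmul(lo64(xcrc), lo64(xK)) ^ next_16_bytes
//
// which preserves the CRC of the data processed so far while consuming 16 more bytes.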
7022
7023 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7024 if (UseAVX > 0) {
7025 vpclmulhdq(xtmp, xK, xcrc);
7026 vpclmulldq(xcrc, xK, xcrc);
7027 pxor(xcrc, xbuf);
7028 pxor(xcrc, xtmp);
7029 } else {
7030 movdqa(xtmp, xcrc);
7031 pclmulhdq(xtmp, xK);
7032 pclmulldq(xcrc, xK);
7033 pxor(xcrc, xbuf);
7034 pxor(xcrc, xtmp);
7035 }
7036 }
7037
7038 /**
7039 * 8-bit folds to compute 32-bit CRC
7040 *
7041 * uint64_t xcrc;
7042 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7043 */
7044 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7045 movdl(tmp, xcrc);
7046 andl(tmp, 0xFF);
7047 movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7048 psrldq(xcrc, 1); // unsigned shift one byte
7049 pxor(xcrc, xtmp);
7050 }
7051
7052 /**
7053 * uint32_t crc;
7054 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7055 */
7056 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7057 movl(tmp, crc);
7058 andl(tmp, 0xFF);
7059 shrl(crc, 8);
7060 xorl(crc, Address(table, tmp, Address::times_4, 0));
7061 }
7062
7063 /**
7064 * @param crc register containing existing CRC (32-bit)
7065 * @param buf register pointing to input byte buffer (byte*)
7066 * @param len register containing number of bytes
7067 * @param table register that will contain address of CRC table
7068 * @param tmp scratch register
7069 */
7070 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7071 assert_different_registers(crc, buf, len, table, tmp, rax);
7072
7073 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7074 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7075
7076 // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
7077 // context for the registers used, since all instructions below use 128-bit mode.
7078 // On EVEX without VL and BW, these instructions will all be AVX.
7079 lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7080 notl(crc); // ~crc
7081 cmpl(len, 16);
7082 jcc(Assembler::less, L_tail);
7083
7084 // Align buffer to 16 bytes
7085 movl(tmp, buf);
7086 andl(tmp, 0xF);
7087 jccb(Assembler::zero, L_aligned);
7088 subl(tmp, 16);
7089 addl(len, tmp);
7090
7091 align(4);
7092 BIND(L_align_loop);
7093 movsbl(rax, Address(buf, 0)); // load byte with sign extension
7094 update_byte_crc32(crc, rax, table);
7095 increment(buf);
7096 incrementl(tmp);
7097 jccb(Assembler::less, L_align_loop);
7098
7099 BIND(L_aligned);
7100 movl(tmp, len); // save
7101 shrl(len, 4);
7102 jcc(Assembler::zero, L_tail_restore);
7103
7104 // Fold crc into first bytes of vector
7105 movdqa(xmm1, Address(buf, 0));
7106 movdl(rax, xmm1);
7107 xorl(crc, rax);
7108 if (VM_Version::supports_sse4_1()) {
7109 pinsrd(xmm1, crc, 0);
7110 } else {
7111 pinsrw(xmm1, crc, 0);
7112 shrl(crc, 16);
7113 pinsrw(xmm1, crc, 1);
7114 }
7115 addptr(buf, 16);
7116 subl(len, 4); // len > 0
7117 jcc(Assembler::less, L_fold_tail);
7118
7119 movdqa(xmm2, Address(buf, 0));
7120 movdqa(xmm3, Address(buf, 16));
7121 movdqa(xmm4, Address(buf, 32));
7122 addptr(buf, 48);
7123 subl(len, 3);
7124 jcc(Assembler::lessEqual, L_fold_512b);
7125
7126 // Fold total 512 bits of polynomial on each iteration,
7127 // 128 bits per each of 4 parallel streams.
7128 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
7129
7130 align32();
7131 BIND(L_fold_512b_loop);
7132 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
7133 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
7134 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
7135 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
7136 addptr(buf, 64);
7137 subl(len, 4);
7138 jcc(Assembler::greater, L_fold_512b_loop);
7139
7140 // Fold 512 bits to 128 bits.
7141 BIND(L_fold_512b);
7142 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7143 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
7144 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7145 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7146
7147 // Fold the rest of 128 bits data chunks
7148 BIND(L_fold_tail);
7149 addl(len, 3);
7150 jccb(Assembler::lessEqual, L_fold_128b);
7151 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7152
7153 BIND(L_fold_tail_loop);
7154 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
7155 addptr(buf, 16);
7156 decrementl(len);
7157 jccb(Assembler::greater, L_fold_tail_loop);
7158
7159 // Fold 128 bits in xmm1 down into 32 bits in crc register.
7160 BIND(L_fold_128b);
7161 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7162 if (UseAVX > 0) {
7163 vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7164 vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7165 vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7166 } else {
7167 movdqa(xmm2, xmm0);
7168 pclmulqdq(xmm2, xmm1, 0x1);
7169 movdqa(xmm3, xmm0);
7170 pand(xmm3, xmm2);
7171 pclmulqdq(xmm0, xmm3, 0x1);
7172 }
7173 psrldq(xmm1, 8);
7174 psrldq(xmm2, 4);
7175 pxor(xmm0, xmm1);
7176 pxor(xmm0, xmm2);
7177
7178 // 8 8-bit folds to compute 32-bit CRC.
7179 for (int j = 0; j < 4; j++) {
7180 fold_8bit_crc32(xmm0, table, xmm1, rax);
7181 }
7182 movdl(crc, xmm0); // mov 32 bits to general register
7183 for (int j = 0; j < 4; j++) {
7184 fold_8bit_crc32(crc, table, rax);
7185 }
7186
7187 BIND(L_tail_restore);
7188 movl(len, tmp); // restore
7189 BIND(L_tail);
7190 andl(len, 0xf);
7191 jccb(Assembler::zero, L_exit);
7192
7193 // Fold the rest of bytes
7194 align(4);
7195 BIND(L_tail_loop);
7196 movsbl(rax, Address(buf, 0)); // load byte with sign extension
7197 update_byte_crc32(crc, rax, table);
7198 increment(buf);
7199 decrementl(len);
7200 jccb(Assembler::greater, L_tail_loop);
7201
7202 BIND(L_exit);
7203 notl(crc); // ~crc
7204 }
7205
7206 #ifdef _LP64
7207 // Helper function for AVX 512 CRC32
7208 // Fold 512-bit data chunks
7209 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
7210 Register pos, int offset) {
7211 evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
7212 evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
7213 evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
7214 evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
7215 evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
7216 }
7217
7218 // Helper function for AVX 512 CRC32
7219 // Compute CRC32 for < 256B buffers
7220 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
7221 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7222 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7223
7224 Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
7225 Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
7226 Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
7227
7228 // check if there is enough buffer to be able to fold 16B at a time
7229 cmpl(len, 32);
7230 jcc(Assembler::less, L_less_than_32);
7231
7232 // if there is, load the constants
7233 movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10
7234 movdl(xmm0, crc); // get the initial crc value
7235 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7236 pxor(xmm7, xmm0);
7237
7238 // update the buffer pointer
7239 addl(pos, 16);
7240 // update the counter; subtract 32 instead of 16 to save one instruction in the loop
7241 subl(len, 32);
7242 jmp(L_16B_reduction_loop);
7243
7244 bind(L_less_than_32);
7245 // move the initial crc to the return value; this is necessary for zero-length buffers
7246 movl(rax, crc);
7247 testl(len, len);
7248 jcc(Assembler::equal, L_cleanup);
7249
7250 movdl(xmm0, crc); //get the initial crc value
7251
7252 cmpl(len, 16);
7253 jcc(Assembler::equal, L_exact_16_left);
7254 jcc(Assembler::less, L_less_than_16_left);
7255
7256 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7257 pxor(xmm7, xmm0); //xor the initial crc value
7258 addl(pos, 16);
7259 subl(len, 16);
7260 movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10
7261 jmp(L_get_last_two_xmms);
7262
7263 bind(L_less_than_16_left);
7264 // use stack space to load data of less than 16 bytes; zero out the 16B in memory first
7265 pxor(xmm1, xmm1);
7266 movptr(tmp1, rsp);
7267 movdqu(Address(tmp1, 0 * 16), xmm1);
7268
7269 cmpl(len, 4);
7270 jcc(Assembler::less, L_only_less_than_4);
7271
7272 //backup the counter value
7273 movl(tmp2, len);
7274 cmpl(len, 8);
7275 jcc(Assembler::less, L_less_than_8_left);
7276
7277 //load 8 Bytes
7278 movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
7279 movq(Address(tmp1, 0 * 16), rax);
7280 addptr(tmp1, 8);
7281 subl(len, 8);
7282 addl(pos, 8);
7283
7284 bind(L_less_than_8_left);
7285 cmpl(len, 4);
7286 jcc(Assembler::less, L_less_than_4_left);
7287
7288 //load 4 Bytes
7289 movl(rax, Address(buf, pos, Address::times_1, 0));
7290 movl(Address(tmp1, 0 * 16), rax);
7291 addptr(tmp1, 4);
7292 subl(len, 4);
7293 addl(pos, 4);
7294
7295 bind(L_less_than_4_left);
7296 cmpl(len, 2);
7297 jcc(Assembler::less, L_less_than_2_left);
7298
7299 // load 2 Bytes
7300 movw(rax, Address(buf, pos, Address::times_1, 0));
7301 movl(Address(tmp1, 0 * 16), rax);
7302 addptr(tmp1, 2);
7303 subl(len, 2);
7304 addl(pos, 2);
7305
7306 bind(L_less_than_2_left);
7307 cmpl(len, 1);
7308 jcc(Assembler::less, L_zero_left);
7309
7310 // load 1 Byte
7311 movb(rax, Address(buf, pos, Address::times_1, 0));
7312 movb(Address(tmp1, 0 * 16), rax);
7313
7314 bind(L_zero_left);
7315 movdqu(xmm7, Address(rsp, 0));
7316 pxor(xmm7, xmm0); //xor the initial crc value
7317
7318 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7319 movdqu(xmm0, Address(rax, tmp2));
7320 pshufb(xmm7, xmm0);
7321 jmp(L_128_done);
7322
7323 bind(L_exact_16_left);
7324 movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
7325 pxor(xmm7, xmm0); //xor the initial crc value
7326 jmp(L_128_done);
7327
7328 bind(L_only_less_than_4);
7329 cmpl(len, 3);
7330 jcc(Assembler::less, L_only_less_than_3);
7331
7332 // load 3 Bytes
7333 movb(rax, Address(buf, pos, Address::times_1, 0));
7334 movb(Address(tmp1, 0), rax);
7335
7336 movb(rax, Address(buf, pos, Address::times_1, 1));
7337 movb(Address(tmp1, 1), rax);
7338
7339 movb(rax, Address(buf, pos, Address::times_1, 2));
7340 movb(Address(tmp1, 2), rax);
7341
7342 movdqu(xmm7, Address(rsp, 0));
7343 pxor(xmm7, xmm0); //xor the initial crc value
7344
7345 pslldq(xmm7, 0x5);
7346 jmp(L_barrett);
7347 bind(L_only_less_than_3);
7348 cmpl(len, 2);
7349 jcc(Assembler::less, L_only_less_than_2);
7350
7351 // load 2 Bytes
7352 movb(rax, Address(buf, pos, Address::times_1, 0));
7353 movb(Address(tmp1, 0), rax);
7354
7355 movb(rax, Address(buf, pos, Address::times_1, 1));
7356 movb(Address(tmp1, 1), rax);
7357
7358 movdqu(xmm7, Address(rsp, 0));
7359 pxor(xmm7, xmm0); //xor the initial crc value
7360
7361 pslldq(xmm7, 0x6);
7362 jmp(L_barrett);
7363
7364 bind(L_only_less_than_2);
7365 //load 1 Byte
7366 movb(rax, Address(buf, pos, Address::times_1, 0));
7367 movb(Address(tmp1, 0), rax);
7368
7369 movdqu(xmm7, Address(rsp, 0));
7370 pxor(xmm7, xmm0); //xor the initial crc value
7371
7372 pslldq(xmm7, 0x7);
7373 }
7374
7375 /**
7376 * Compute CRC32 using AVX512 instructions
7377 * param crc register containing existing CRC (32-bit)
7378 * param buf register pointing to input byte buffer (byte*)
7379 * param len register containing number of bytes
7380 * param table address of crc or crc32c table
7381 * param tmp1 scratch register
7382 * param tmp2 scratch register
7383 * return rax result register
7384 *
7385 * This routine is identical for crc32c with the exception of the precomputed constant
7386 * table which will be passed as the table argument. The calculation steps are
7387 * the same for both variants.
7388 */
7389 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
7390 assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
7391
7392 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7393 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7394 Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
7395 Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
7396 Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
7397
7398 const Register pos = r12;
7399 push(r12);
7400 subptr(rsp, 16 * 2 + 8);
7401
7402 // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
7403 // context for the registers used, since all instructions below use 128-bit mode.
7404 // On EVEX without VL and BW, these instructions will all be AVX.
7405 movl(pos, 0);
7406
7407 // check if smaller than 256B
7408 cmpl(len, 256);
7409 jcc(Assembler::less, L_less_than_256);
7410
7411 // load the initial crc value
7412 movdl(xmm10, crc);
7413
7414 // load the initial 128B of data (two 64B chunks), xor the initial crc value into the first chunk
7415 evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
7416 evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
7417 evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
7418 evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
7419
7420 subl(len, 256);
7421 cmpl(len, 256);
7422 jcc(Assembler::less, L_fold_128_B_loop);
7423
7424 evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
7425 evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
7426 evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
7427 subl(len, 256);
7428
7429 bind(L_fold_256_B_loop);
7430 addl(pos, 256);
7431 fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
7432 fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
7433 fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
7434 fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
7435
7436 subl(len, 256);
7437 jcc(Assembler::greaterEqual, L_fold_256_B_loop);
7438
7439 // Fold 256 into 128
7440 addl(pos, 256);
7441 evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
7442 evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
7443 vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
7444
7445 evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
7446 evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
7447 vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
7448
7449 evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
7450 evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
7451
7452 addl(len, 128);
7453 jmp(L_fold_128_B_register);
7454
7455 // At this point in the code there are 128 * x + y (0 <= y < 128) bytes of buffer. The
7456 // fold_128_B_loop below will fold 128B at a time until 128 + y bytes of buffer remain.
7457
7458 // fold 128B at a time. Each iteration of this loop folds two 512-bit registers (the equivalent of 8 xmm registers) in parallel
7459 bind(L_fold_128_B_loop);
7460 addl(pos, 128);
7461 fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
7462 fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
7463
7464 subl(len, 128);
7465 jcc(Assembler::greaterEqual, L_fold_128_B_loop);
7466
7467 addl(pos, 128);
7468
7469 // at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
7470 // the 128B of folded data is held in zmm0 and zmm4 (the equivalent of 8 xmm registers)
7471 bind(L_fold_128_B_register);
7472 evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
7473 evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7474 evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
7475 evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
7476 // save the last 128-bit lane of zmm4, which has no fold constant (multiplicand)
7477 vextracti64x2(xmm7, xmm4, 3);
7478
7479 evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
7480 evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
7481 // Needed later in reduction loop
7482 movdqu(xmm10, Address(table, 1 * 16));
7483 vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
7484 vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
7485
7486 // Swap 1,0,3,2 - 01 00 11 10
7487 evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
7488 evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
7489 vextracti128(xmm5, xmm8, 1);
7490 evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
7491
7492 // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
7493 // instead of a cmp instruction, we use the negative flag with the jl instruction
7494 addl(len, 128 - 16);
7495 jcc(Assembler::less, L_final_reduction_for_128);
7496
7497 bind(L_16B_reduction_loop);
7498 vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7499 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7500 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7501 movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
7502 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7503 addl(pos, 16);
7504 subl(len, 16);
7505 jcc(Assembler::greaterEqual, L_16B_reduction_loop);
7506
7507 bind(L_final_reduction_for_128);
7508 addl(len, 16);
7509 jcc(Assembler::equal, L_128_done);
7510
7511 bind(L_get_last_two_xmms);
7512 movdqu(xmm2, xmm7);
7513 addl(pos, len);
7514 movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
7515 subl(pos, len);
7516
7517 // get rid of the extra data that was loaded before
7518 // load the shift constant
7519 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7520 movdqu(xmm0, Address(rax, len));
7521 addl(rax, len);
7522
7523 vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7524 //Change mask to 512
7525 vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
7526 vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
7527
7528 blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
7529 vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7530 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7531 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7532 vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
7533
7534 bind(L_128_done);
7535 // compute crc of a 128-bit value
7536 movdqu(xmm10, Address(table, 3 * 16));
7537 movdqu(xmm0, xmm7);
7538
7539 // 64b fold
7540 vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
7541 vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
7542 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7543
7544 // 32b fold
7545 movdqu(xmm0, xmm7);
7546 vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
7547 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7548 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7549 jmp(L_barrett);
7550
7551 bind(L_less_than_256);
7552 kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
7553
7554 // Barrett reduction: reduce the folded value modulo the CRC polynomial using two carry-less multiplies with precomputed constants instead of a division
7555 bind(L_barrett);
7556 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
7557 movdqu(xmm1, xmm7);
7558 movdqu(xmm2, xmm7);
7559 movdqu(xmm10, Address(table, 4 * 16));
7560
7561 pclmulqdq(xmm7, xmm10, 0x0);
7562 pxor(xmm7, xmm2);
7563 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
7564 movdqu(xmm2, xmm7);
7565 pclmulqdq(xmm7, xmm10, 0x10);
7566 pxor(xmm7, xmm2);
7567 pxor(xmm7, xmm1);
7568 pextrd(crc, xmm7, 2);
7569
7570 bind(L_cleanup);
7571 addptr(rsp, 16 * 2 + 8);
7572 pop(r12);
7573 }
7574
7575 // S. Gueron / Information Processing Letters 112 (2012) 184
7576 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
7577 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
7578 // Output: the 64-bit carry-less product of B * CONST
7579 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
7580 Register tmp1, Register tmp2, Register tmp3) {
7581 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7582 if (n > 0) {
7583 addq(tmp3, n * 256 * 8);
7584 }
7585 // Q1 = TABLEExt[n][B & 0xFF];
7586 movl(tmp1, in);
7587 andl(tmp1, 0x000000FF);
7588 shll(tmp1, 3);
7589 addq(tmp1, tmp3);
7590 movq(tmp1, Address(tmp1, 0));
7591
7592 // Q2 = TABLEExt[n][B >> 8 & 0xFF];
7593 movl(tmp2, in);
7594 shrl(tmp2, 8);
7595 andl(tmp2, 0x000000FF);
7596 shll(tmp2, 3);
7597 addq(tmp2, tmp3);
7598 movq(tmp2, Address(tmp2, 0));
7599
7600 shlq(tmp2, 8);
7601 xorq(tmp1, tmp2);
7602
7603 // Q3 = TABLEExt[n][B >> 16 & 0xFF];
7604 movl(tmp2, in);
7605 shrl(tmp2, 16);
7606 andl(tmp2, 0x000000FF);
7607 shll(tmp2, 3);
7608 addq(tmp2, tmp3);
7609 movq(tmp2, Address(tmp2, 0));
7610
7611 shlq(tmp2, 16);
7612 xorq(tmp1, tmp2);
7613
7614 // Q4 = TABLEExt[n][B >> 24 & 0xFF];
7615 shrl(in, 24);
7616 andl(in, 0x000000FF);
7617 shll(in, 3);
7618 addq(in, tmp3);
7619 movq(in, Address(in, 0));
7620
7621 shlq(in, 24);
7622 xorq(in, tmp1);
7623 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7624 }
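// Scalar reference for Algorithm 4 above (a sketch only; TABLEExt_n stands for the
// 256-entry, 8-byte-wide sub-table for chunk n of the crc32c table):
//
//   static inline uint64_t crc32c_clmul_by_table_ref(uint32_t B, const uint64_t* TABLEExt_n) {
//     uint64_t Q1 = TABLEExt_n[(B >>  0) & 0xFF];
//     uint64_t Q2 = TABLEExt_n[(B >>  8) & 0xFF];
//     uint64_t Q3 = TABLEExt_n[(B >> 16) & 0xFF];
//     uint64_t Q4 = TABLEExt_n[(B >> 24) & 0xFF];
//     return Q1 ^ (Q2 << 8) ^ (Q3 << 16) ^ (Q4 << 24);
//   }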
7625
7626 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7627 Register in_out,
7628 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7629 XMMRegister w_xtmp2,
7630 Register tmp1,
7631 Register n_tmp2, Register n_tmp3) {
7632 if (is_pclmulqdq_supported) {
7633 movdl(w_xtmp1, in_out); // modified blindly
7634
7635 movl(tmp1, const_or_pre_comp_const_index);
7636 movdl(w_xtmp2, tmp1);
7637 pclmulqdq(w_xtmp1, w_xtmp2, 0);
7638
7639 movdq(in_out, w_xtmp1);
7640 } else {
7641 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
7642 }
7643 }
7644
7645 // Recombination Alternative 2: No bit-reflections
7646 // T1 = (CRC_A * U1) << 1
7647 // T2 = (CRC_B * U2) << 1
7648 // C1 = T1 >> 32
7649 // C2 = T2 >> 32
7650 // T1 = T1 & 0xFFFFFFFF
7651 // T2 = T2 & 0xFFFFFFFF
7652 // T1 = CRC32(0, T1)
7653 // T2 = CRC32(0, T2)
7654 // C1 = C1 ^ T1
7655 // C2 = C2 ^ T2
7656 // CRC = C1 ^ C2 ^ CRC_C
7657 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7658 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7659 Register tmp1, Register tmp2,
7660 Register n_tmp3) {
7661 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7662 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7663 shlq(in_out, 1);
7664 movl(tmp1, in_out);
7665 shrq(in_out, 32);
7666 xorl(tmp2, tmp2);
7667 crc32(tmp2, tmp1, 4);
7668 xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
7669 shlq(in1, 1);
7670 movl(tmp1, in1);
7671 shrq(in1, 32);
7672 xorl(tmp2, tmp2);
7673 crc32(tmp2, tmp1, 4);
7674 xorl(in1, tmp2);
7675 xorl(in_out, in1);
7676 xorl(in_out, in2);
7677 }
7678
7679 // Set N to a predefined value
7680 // Subtract from the length of the buffer
7681 // execute in a loop:
7682 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
7683 // for i = 1 to N do
7684 // CRC_A = CRC32(CRC_A, A[i])
7685 // CRC_B = CRC32(CRC_B, B[i])
7686 // CRC_C = CRC32(CRC_C, C[i])
7687 // end for
7688 // Recombine
7689 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7690 Register in_out1, Register in_out2, Register in_out3,
7691 Register tmp1, Register tmp2, Register tmp3,
7692 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7693 Register tmp4, Register tmp5,
7694 Register n_tmp6) {
7695 Label L_processPartitions;
7696 Label L_processPartition;
7697 Label L_exit;
7698
7699 bind(L_processPartitions);
7700 cmpl(in_out1, 3 * size);
7701 jcc(Assembler::less, L_exit);
7702 xorl(tmp1, tmp1);
7703 xorl(tmp2, tmp2);
7704 movq(tmp3, in_out2);
7705 addq(tmp3, size);
7706
7707 bind(L_processPartition);
7708 crc32(in_out3, Address(in_out2, 0), 8);
7709 crc32(tmp1, Address(in_out2, size), 8);
7710 crc32(tmp2, Address(in_out2, size * 2), 8);
7711 addq(in_out2, 8);
7712 cmpq(in_out2, tmp3);
7713 jcc(Assembler::less, L_processPartition);
7714 crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7715 w_xtmp1, w_xtmp2, w_xtmp3,
7716 tmp4, tmp5,
7717 n_tmp6);
7718 addq(in_out2, 2 * size);
7719 subl(in_out1, 3 * size);
7720 jmp(L_processPartitions);
7721
7722 bind(L_exit);
7723 }
7724 #else
7725 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
7726 Register tmp1, Register tmp2, Register tmp3,
7727 XMMRegister xtmp1, XMMRegister xtmp2) {
7728 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7729 if (n > 0) {
7730 addl(tmp3, n * 256 * 8);
7731 }
7732 // Q1 = TABLEExt[n][B & 0xFF];
7733 movl(tmp1, in_out);
7734 andl(tmp1, 0x000000FF);
7735 shll(tmp1, 3);
7736 addl(tmp1, tmp3);
7737 movq(xtmp1, Address(tmp1, 0));
7738
7739 // Q2 = TABLEExt[n][B >> 8 & 0xFF];
7740 movl(tmp2, in_out);
7741 shrl(tmp2, 8);
7742 andl(tmp2, 0x000000FF);
7743 shll(tmp2, 3);
7744 addl(tmp2, tmp3);
7745 movq(xtmp2, Address(tmp2, 0));
7746
7747 psllq(xtmp2, 8);
7748 pxor(xtmp1, xtmp2);
7749
7750 // Q3 = TABLEExt[n][B >> 16 & 0xFF];
7751 movl(tmp2, in_out);
7752 shrl(tmp2, 16);
7753 andl(tmp2, 0x000000FF);
7754 shll(tmp2, 3);
7755 addl(tmp2, tmp3);
7756 movq(xtmp2, Address(tmp2, 0));
7757
7758 psllq(xtmp2, 16);
7759 pxor(xtmp1, xtmp2);
7760
7761 // Q4 = TABLEExt[n][B >> 24 & 0xFF];
7762 shrl(in_out, 24);
7763 andl(in_out, 0x000000FF);
7764 shll(in_out, 3);
7765 addl(in_out, tmp3);
7766 movq(xtmp2, Address(in_out, 0));
7767
7768 psllq(xtmp2, 24);
7769 pxor(xtmp1, xtmp2); // Result in CXMM
7770 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7771 }
7772
7773 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7774 Register in_out,
7775 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7776 XMMRegister w_xtmp2,
7777 Register tmp1,
7778 Register n_tmp2, Register n_tmp3) {
7779 if (is_pclmulqdq_supported) {
7780 movdl(w_xtmp1, in_out);
7781
7782 movl(tmp1, const_or_pre_comp_const_index);
7783 movdl(w_xtmp2, tmp1);
7784 pclmulqdq(w_xtmp1, w_xtmp2, 0);
7785 // Keep result in XMM since GPR is 32 bit in length
7786 } else {
7787 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
7788 }
7789 }
7790
7791 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7792 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7793 Register tmp1, Register tmp2,
7794 Register n_tmp3) {
7795 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7796 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7797
7798 psllq(w_xtmp1, 1);
7799 movdl(tmp1, w_xtmp1);
7800 psrlq(w_xtmp1, 32);
7801 movdl(in_out, w_xtmp1);
7802
7803 xorl(tmp2, tmp2);
7804 crc32(tmp2, tmp1, 4);
7805 xorl(in_out, tmp2);
7806
7807 psllq(w_xtmp2, 1);
7808 movdl(tmp1, w_xtmp2);
7809 psrlq(w_xtmp2, 32);
7810 movdl(in1, w_xtmp2);
7811
7812 xorl(tmp2, tmp2);
7813 crc32(tmp2, tmp1, 4);
7814 xorl(in1, tmp2);
7815 xorl(in_out, in1);
7816 xorl(in_out, in2);
7817 }
7818
7819 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7820 Register in_out1, Register in_out2, Register in_out3,
7821 Register tmp1, Register tmp2, Register tmp3,
7822 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7823 Register tmp4, Register tmp5,
7824 Register n_tmp6) {
7825 Label L_processPartitions;
7826 Label L_processPartition;
7827 Label L_exit;
7828
7829 bind(L_processPartitions);
7830 cmpl(in_out1, 3 * size);
7831 jcc(Assembler::less, L_exit);
7832 xorl(tmp1, tmp1);
7833 xorl(tmp2, tmp2);
7834 movl(tmp3, in_out2);
7835 addl(tmp3, size);
7836
7837 bind(L_processPartition);
7838 crc32(in_out3, Address(in_out2, 0), 4);
7839 crc32(tmp1, Address(in_out2, size), 4);
7840 crc32(tmp2, Address(in_out2, size*2), 4);
7841 crc32(in_out3, Address(in_out2, 0+4), 4);
7842 crc32(tmp1, Address(in_out2, size+4), 4);
7843 crc32(tmp2, Address(in_out2, size*2+4), 4);
7844 addl(in_out2, 8);
7845 cmpl(in_out2, tmp3);
7846 jcc(Assembler::less, L_processPartition);
7847
7848 push(tmp3);
7849 push(in_out1);
7850 push(in_out2);
7851 tmp4 = tmp3;
7852 tmp5 = in_out1;
7853 n_tmp6 = in_out2;
7854
7855 crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7856 w_xtmp1, w_xtmp2, w_xtmp3,
7857 tmp4, tmp5,
7858 n_tmp6);
7859
7860 pop(in_out2);
7861 pop(in_out1);
7862 pop(tmp3);
7863
7864 addl(in_out2, 2 * size);
7865 subl(in_out1, 3 * size);
7866 jmp(L_processPartitions);
7867
7868 bind(L_exit);
7869 }
7870 #endif //LP64
7871
7872 #ifdef _LP64
7873 // Algorithm 2: Pipelined usage of the CRC32 instruction.
7874 // Input: A buffer I of L bytes.
7875 // Output: the CRC32C value of the buffer.
7876 // Notations:
7877 // Write L = 24N + r, with N = floor (L/24).
7878 // r = L mod 24 (0 <= r < 24).
7879 // Consider I as the concatenation of A|B|C|R, where A, B and C each consist of
7880 // N quadwords, and R consists of r bytes.
7881 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
7882 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
7883 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
7884 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
7885 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7886 Register tmp1, Register tmp2, Register tmp3,
7887 Register tmp4, Register tmp5, Register tmp6,
7888 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7889 bool is_pclmulqdq_supported) {
7890 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7891 Label L_wordByWord;
7892 Label L_byteByByteProlog;
7893 Label L_byteByByte;
7894 Label L_exit;
7895
7896 if (is_pclmulqdq_supported ) {
7897 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7898 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
7899
7900 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7901 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7902
7903 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
7904 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
7905 assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
7906 } else {
7907 const_or_pre_comp_const_index[0] = 1;
7908 const_or_pre_comp_const_index[1] = 0;
7909
7910 const_or_pre_comp_const_index[2] = 3;
7911 const_or_pre_comp_const_index[3] = 2;
7912
7913 const_or_pre_comp_const_index[4] = 5;
7914 const_or_pre_comp_const_index[5] = 4;
7915 }
7916 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
7917 in2, in1, in_out,
7918 tmp1, tmp2, tmp3,
7919 w_xtmp1, w_xtmp2, w_xtmp3,
7920 tmp4, tmp5,
7921 tmp6);
7922 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
7923 in2, in1, in_out,
7924 tmp1, tmp2, tmp3,
7925 w_xtmp1, w_xtmp2, w_xtmp3,
7926 tmp4, tmp5,
7927 tmp6);
7928 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
7929 in2, in1, in_out,
7930 tmp1, tmp2, tmp3,
7931 w_xtmp1, w_xtmp2, w_xtmp3,
7932 tmp4, tmp5,
7933 tmp6);
7934 movl(tmp1, in2);
7935 andl(tmp1, 0x00000007);
7936 negl(tmp1);
7937 addl(tmp1, in2);
7938 addq(tmp1, in1);
7939
7940 cmpq(in1, tmp1);
7941 jccb(Assembler::greaterEqual, L_byteByByteProlog);
7942 align(16);
7943 BIND(L_wordByWord);
7944 crc32(in_out, Address(in1, 0), 8);
7945 addq(in1, 8);
7946 cmpq(in1, tmp1);
7947 jcc(Assembler::less, L_wordByWord);
7948
7949 BIND(L_byteByByteProlog);
7950 andl(in2, 0x00000007);
7951 movl(tmp2, 1);
7952
7953 cmpl(tmp2, in2);
7954 jccb(Assembler::greater, L_exit);
7955 BIND(L_byteByByte);
7956 crc32(in_out, Address(in1, 0), 1);
7957 incq(in1);
7958 incl(tmp2);
7959 cmpl(tmp2, in2);
7960 jcc(Assembler::lessEqual, L_byteByByte);
7961
7962 BIND(L_exit);
7963 }
7964 #else
7965 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7966 Register tmp1, Register tmp2, Register tmp3,
7967 Register tmp4, Register tmp5, Register tmp6,
7968 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7969 bool is_pclmulqdq_supported) {
7970 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7971 Label L_wordByWord;
7972 Label L_byteByByteProlog;
7973 Label L_byteByByte;
7974 Label L_exit;
7975
7976 if (is_pclmulqdq_supported) {
7977 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7978 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
7979
7980 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7981 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7982
7983 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
7984 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
7985 } else {
7986 const_or_pre_comp_const_index[0] = 1;
7987 const_or_pre_comp_const_index[1] = 0;
7988
7989 const_or_pre_comp_const_index[2] = 3;
7990 const_or_pre_comp_const_index[3] = 2;
7991
7992 const_or_pre_comp_const_index[4] = 5;
7993 const_or_pre_comp_const_index[5] = 4;
7994 }
7995 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
7996 in2, in1, in_out,
7997 tmp1, tmp2, tmp3,
7998 w_xtmp1, w_xtmp2, w_xtmp3,
7999 tmp4, tmp5,
8000 tmp6);
8001 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8002 in2, in1, in_out,
8003 tmp1, tmp2, tmp3,
8004 w_xtmp1, w_xtmp2, w_xtmp3,
8005 tmp4, tmp5,
8006 tmp6);
8007 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8008 in2, in1, in_out,
8009 tmp1, tmp2, tmp3,
8010 w_xtmp1, w_xtmp2, w_xtmp3,
8011 tmp4, tmp5,
8012 tmp6);
8013 movl(tmp1, in2);
8014 andl(tmp1, 0x00000007);
8015 negl(tmp1);
8016 addl(tmp1, in2);
8017 addl(tmp1, in1);
8018
8019 BIND(L_wordByWord);
8020 cmpl(in1, tmp1);
8021 jcc(Assembler::greaterEqual, L_byteByByteProlog);
8022 crc32(in_out, Address(in1,0), 4);
8023 addl(in1, 4);
8024 jmp(L_wordByWord);
8025
8026 BIND(L_byteByByteProlog);
8027 andl(in2, 0x00000007);
8028 movl(tmp2, 1);
8029
8030 BIND(L_byteByByte);
8031 cmpl(tmp2, in2);
8032 jccb(Assembler::greater, L_exit);
8033 movb(tmp1, Address(in1, 0));
8034 crc32(in_out, tmp1, 1);
8035 incl(in1);
8036 incl(tmp2);
8037 jmp(L_byteByByte);
8038
8039 BIND(L_exit);
8040 }
8041 #endif // LP64
8042 #undef BIND
8043 #undef BLOCK_COMMENT
8044
8045 // Compress char[] array to byte[].
8046 // ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
8047 // @IntrinsicCandidate
8048 // private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8049 // for (int i = 0; i < len; i++) {
8050 // int c = src[srcOff++];
8051 // if (c >>> 8 != 0) {
8052 // return 0;
8053 // }
8054 // dst[dstOff++] = (byte)c;
8055 // }
8056 // return len;
8057 // }
8058 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
8059 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8060 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8061 Register tmp5, Register result, KRegister mask1, KRegister mask2) {
8062 Label copy_chars_loop, return_length, return_zero, done;
8063
8064 // rsi: src
8065 // rdi: dst
8066 // rdx: len
8067 // rcx: tmp5
8068 // rax: result
8069
8070 // rsi holds start addr of source char[] to be compressed
8071 // rdi holds start addr of destination byte[]
8072 // rdx holds length
8073
8074 assert(len != result, "");
8075
8076 // save length for return
8077 push(len);
8078
8079 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
8080 VM_Version::supports_avx512vlbw() &&
8081 VM_Version::supports_bmi2()) {
8082
8083 Label copy_32_loop, copy_loop_tail, below_threshold;
8084
8085 // alignment
8086 Label post_alignment;
8087
8088 // if the length of the string is less than 32, handle it the old-fashioned way
8089 testl(len, -32);
8090 jcc(Assembler::zero, below_threshold);
8091
8092 // First check whether a character is compressible (<= 0xFF).
8093 // Create mask to test for Unicode chars inside zmm vector
8094 movl(result, 0x00FF);
8095 evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
8096
8097 testl(len, -64);
8098 jcc(Assembler::zero, post_alignment);
8099
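    // Number of chars to process (one output byte each) before dst reaches a
    // 32-byte boundary: tmp5 = (-dst) & 31.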
8100 movl(tmp5, dst);
8101 andl(tmp5, (32 - 1));
8102 negl(tmp5);
8103 andl(tmp5, (32 - 1));
8104
8105 // bail out when there is nothing to be done
8106 testl(tmp5, 0xFFFFFFFF);
8107 jcc(Assembler::zero, post_alignment);
8108
8109 // ~(~0 << len), where len is the # of remaining elements to process
8110 movl(result, 0xFFFFFFFF);
8111 shlxl(result, result, tmp5);
8112 notl(result);
8113 kmovdl(mask2, result);
8114
8115 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8116 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8117 ktestd(mask1, mask2);
8118 jcc(Assembler::carryClear, return_zero);
8119
8120 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8121
8122 addptr(src, tmp5);
8123 addptr(src, tmp5);
8124 addptr(dst, tmp5);
8125 subl(len, tmp5);
8126
8127 bind(post_alignment);
8128 // end of alignment
8129
8130 movl(tmp5, len);
8131 andl(tmp5, (32 - 1)); // tail count (in chars)
8132 andl(len, ~(32 - 1)); // vector count (in chars)
8133 jcc(Assembler::zero, copy_loop_tail);
8134
8135 lea(src, Address(src, len, Address::times_2));
8136 lea(dst, Address(dst, len, Address::times_1));
8137 negptr(len);
8138
8139 bind(copy_32_loop);
8140 evmovdquw(tmp1Reg, Address(src, len, Address::times_2), /*merge*/ false, Assembler::AVX_512bit);
8141 evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
8142 kortestdl(mask1, mask1);
8143 jcc(Assembler::carryClear, return_zero);
8144
8145 // All elements in the current processed chunk are valid candidates for
8146 // compression. Write the truncated byte elements to memory.
8147 evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
8148 addptr(len, 32);
8149 jcc(Assembler::notZero, copy_32_loop);
8150
8151 bind(copy_loop_tail);
8152 // bail out when there is nothing to be done
8153 testl(tmp5, 0xFFFFFFFF);
8154 jcc(Assembler::zero, return_length);
8155
8156 movl(len, tmp5);
8157
8158 // ~(~0 << len), where len is the # of remaining elements to process
8159 movl(result, 0xFFFFFFFF);
8160 shlxl(result, result, len);
8161 notl(result);
8162
8163 kmovdl(mask2, result);
8164
8165 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8166 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8167 ktestd(mask1, mask2);
8168 jcc(Assembler::carryClear, return_zero);
8169
8170 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8171 jmp(return_length);
8172
8173 bind(below_threshold);
8174 }
8175
8176 if (UseSSE42Intrinsics) {
8177 Label copy_32_loop, copy_16, copy_tail;
8178
8179 movl(result, len);
8180
8181 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
8182
8183 // vectored compression
8184 andl(len, 0xfffffff0); // vector count (in chars)
8185 andl(result, 0x0000000f); // tail count (in chars)
8186 testl(len, len);
8187 jcc(Assembler::zero, copy_16);
8188
8189 // compress 16 chars per iter
8190 movdl(tmp1Reg, tmp5);
8191 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
8192 pxor(tmp4Reg, tmp4Reg);
8193
8194 lea(src, Address(src, len, Address::times_2));
8195 lea(dst, Address(dst, len, Address::times_1));
8196 negptr(len);
8197
8198 bind(copy_32_loop);
8199 movdqu(tmp2Reg, Address(src, len, Address::times_2)); // load 1st 8 characters
8200 por(tmp4Reg, tmp2Reg);
8201 movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
8202 por(tmp4Reg, tmp3Reg);
8203 ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector
8204 jcc(Assembler::notZero, return_zero);
8205 packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte
8206 movdqu(Address(dst, len, Address::times_1), tmp2Reg);
8207 addptr(len, 16);
8208 jcc(Assembler::notZero, copy_32_loop);
8209
8210 // compress next vector of 8 chars (if any)
8211 bind(copy_16);
8212 movl(len, result);
8213 andl(len, 0xfffffff8); // vector count (in chars)
8214 andl(result, 0x00000007); // tail count (in chars)
8215 testl(len, len);
8216 jccb(Assembler::zero, copy_tail);
8217
8218 movdl(tmp1Reg, tmp5);
8219 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
8220 pxor(tmp3Reg, tmp3Reg);
8221
8222 movdqu(tmp2Reg, Address(src, 0));
8223 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
8224 jccb(Assembler::notZero, return_zero);
8225 packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte
8226 movq(Address(dst, 0), tmp2Reg);
8227 addptr(src, 16);
8228 addptr(dst, 8);
8229
8230 bind(copy_tail);
8231 movl(len, result);
8232 }
8233 // compress 1 char per iter
8234 testl(len, len);
8235 jccb(Assembler::zero, return_length);
8236 lea(src, Address(src, len, Address::times_2));
8237 lea(dst, Address(dst, len, Address::times_1));
8238 negptr(len);
8239
8240 bind(copy_chars_loop);
8241 load_unsigned_short(result, Address(src, len, Address::times_2));
8242 testl(result, 0xff00); // check if Unicode char
8243 jccb(Assembler::notZero, return_zero);
8244 movb(Address(dst, len, Address::times_1), result); // LATIN1 char; compress to 1 byte
8245 increment(len);
8246 jcc(Assembler::notZero, copy_chars_loop);
8247
8248 // if compression succeeded, return length
8249 bind(return_length);
8250 pop(result);
8251 jmpb(done);
8252
8253 // if compression failed, return 0
8254 bind(return_zero);
8255 xorl(result, result);
8256 addptr(rsp, wordSize);
8257
8258 bind(done);
8259 }
8260
8261 // Inflate byte[] array to char[].
8262 // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
8263 // @IntrinsicCandidate
8264 // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
8265 // for (int i = 0; i < len; i++) {
8266 // dst[dstOff++] = (char)(src[srcOff++] & 0xff);
8267 // }
8268 // }
8269 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
8270 XMMRegister tmp1, Register tmp2, KRegister mask) {
8271 Label copy_chars_loop, done, below_threshold, avx3_threshold;
8272 // rsi: src
8273 // rdi: dst
8274 // rdx: len
8275 // rcx: tmp2
8276
8277 // rsi holds start addr of source byte[] to be inflated
8278 // rdi holds start addr of destination char[]
8279 // rdx holds length
8280 assert_different_registers(src, dst, len, tmp2);
8281 movl(tmp2, len);
8282 if ((UseAVX > 2) && // AVX512
8283 VM_Version::supports_avx512vlbw() &&
8284 VM_Version::supports_bmi2()) {
8285
8286 Label copy_32_loop, copy_tail;
8287 Register tmp3_aliased = len;
8288
8289 // if the length of the string is less than 16, handle it the old-fashioned way
8290 testl(len, -16);
8291 jcc(Assembler::zero, below_threshold);
8292
8293 testl(len, -1 * AVX3Threshold);
8294 jcc(Assembler::zero, avx3_threshold);
8295
8296 // In order to use only one arithmetic operation for the main loop we use
8297 // this pre-calculation
8298 andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
8299 andl(len, -32); // vector count
8300 jccb(Assembler::zero, copy_tail);
8301
8302 lea(src, Address(src, len, Address::times_1));
8303 lea(dst, Address(dst, len, Address::times_2));
8304 negptr(len);
8305
8306
8307 // inflate 32 chars per iter
8308 bind(copy_32_loop);
8309 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
8310 evmovdquw(Address(dst, len, Address::times_2), tmp1, /*merge*/ false, Assembler::AVX_512bit);
8311 addptr(len, 32);
8312 jcc(Assembler::notZero, copy_32_loop);
8313
8314 bind(copy_tail);
8315 // bail out when there is nothing to be done
8316 testl(tmp2, -1); // we don't destroy the contents of tmp2 here
8317 jcc(Assembler::zero, done);
8318
8319 // ~(~0 << length), where length is the # of remaining elements to process
8320 movl(tmp3_aliased, -1);
8321 shlxl(tmp3_aliased, tmp3_aliased, tmp2);
8322 notl(tmp3_aliased);
8323 kmovdl(mask, tmp3_aliased);
8324 evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
8325 evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
8326
8327 jmp(done);
8328 bind(avx3_threshold);
8329 }
8330 if (UseSSE42Intrinsics) {
8331 Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
8332
8333 if (UseAVX > 1) {
8334 andl(tmp2, (16 - 1));
8335 andl(len, -16);
8336 jccb(Assembler::zero, copy_new_tail);
8337 } else {
8338 andl(tmp2, 0x00000007); // tail count (in chars)
8339 andl(len, 0xfffffff8); // vector count (in chars)
8340 jccb(Assembler::zero, copy_tail);
8341 }
8342
8343 // vectored inflation
8344 lea(src, Address(src, len, Address::times_1));
8345 lea(dst, Address(dst, len, Address::times_2));
8346 negptr(len);
8347
8348 if (UseAVX > 1) {
8349 bind(copy_16_loop);
8350 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
8351 vmovdqu(Address(dst, len, Address::times_2), tmp1);
8352 addptr(len, 16);
8353 jcc(Assembler::notZero, copy_16_loop);
8354
8355 bind(below_threshold);
8356 bind(copy_new_tail);
8357 movl(len, tmp2);
8358 andl(tmp2, 0x00000007);
8359 andl(len, 0xFFFFFFF8);
8360 jccb(Assembler::zero, copy_tail);
8361
8362 pmovzxbw(tmp1, Address(src, 0));
8363 movdqu(Address(dst, 0), tmp1);
8364 addptr(src, 8);
8365 addptr(dst, 2 * 8);
8366
8367 jmp(copy_tail, true);
8368 }
8369
8370 // inflate 8 chars per iter
8371 bind(copy_8_loop);
8372 pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
8373 movdqu(Address(dst, len, Address::times_2), tmp1);
8374 addptr(len, 8);
8375 jcc(Assembler::notZero, copy_8_loop);
8376
8377 bind(copy_tail);
8378 movl(len, tmp2);
8379
8380 cmpl(len, 4);
8381 jccb(Assembler::less, copy_bytes);
8382
8383 movdl(tmp1, Address(src, 0)); // load 4 byte chars
8384 pmovzxbw(tmp1, tmp1);
8385 movq(Address(dst, 0), tmp1);
8386 subptr(len, 4);
8387 addptr(src, 4);
8388 addptr(dst, 8);
8389
8390 bind(copy_bytes);
8391 } else {
8392 bind(below_threshold);
8393 }
8394
8395 testl(len, len);
8396 jccb(Assembler::zero, done);
8397 lea(src, Address(src, len, Address::times_1));
8398 lea(dst, Address(dst, len, Address::times_2));
8399 negptr(len);
8400
8401 // inflate 1 char per iter
8402 bind(copy_chars_loop);
8403 load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char
8404 movw(Address(dst, len, Address::times_2), tmp2); // inflate byte char to word
8405 increment(len);
8406 jcc(Assembler::notZero, copy_chars_loop);
8407
8408 bind(done);
8409 }
8410
8411
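// Dispatch a masked evmovdqu load to the element-width-specific form for the
// given Java type, so the k-mask bits map to lanes of the right size; the
// store variant below does the same for the memory-destination direction.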
8412 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
8413 switch(type) {
8414 case T_BYTE:
8415 case T_BOOLEAN:
8416 evmovdqub(dst, kmask, src, false, vector_len);
8417 break;
8418 case T_CHAR:
8419 case T_SHORT:
8420 evmovdquw(dst, kmask, src, false, vector_len);
8421 break;
8422 case T_INT:
8423 case T_FLOAT:
8424 evmovdqul(dst, kmask, src, false, vector_len);
8425 break;
8426 case T_LONG:
8427 case T_DOUBLE:
8428 evmovdquq(dst, kmask, src, false, vector_len);
8429 break;
8430 default:
8431 fatal("Unexpected type argument %s", type2name(type));
8432 break;
8433 }
8434 }
8435
8436 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
8437 switch(type) {
8438 case T_BYTE:
8439 case T_BOOLEAN:
8440 evmovdqub(dst, kmask, src, true, vector_len);
8441 break;
8442 case T_CHAR:
8443 case T_SHORT:
8444 evmovdquw(dst, kmask, src, true, vector_len);
8445 break;
8446 case T_INT:
8447 case T_FLOAT:
8448 evmovdqul(dst, kmask, src, true, vector_len);
8449 break;
8450 case T_LONG:
8451 case T_DOUBLE:
8452 evmovdquq(dst, kmask, src, true, vector_len);
8453 break;
8454 default:
8455 fatal("Unexpected type argument %s", type2name(type));
8456 break;
8457 }
8458 }
8459
8460 #if COMPILER2_OR_JVMCI
8461
8462
// Set-memory (fill) operation for lengths of less than 64 bytes.
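// For example (illustrative): with shift == 2 (int elements) and length == 9,
// the non-64-byte path stores a full 32-byte vector and then a masked store of
// the single remaining int; the 64-byte path issues one masked 512-bit store
// with the low 9 mask bits set.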
8464 void MacroAssembler::fill64_masked_avx(uint shift, Register dst, int disp,
8465 XMMRegister xmm, KRegister mask, Register length,
8466 Register temp, bool use64byteVector) {
8467 assert(MaxVectorSize >= 32, "vector length should be >= 32");
  assert(shift != 0, "shift value should be 1 (short), 2 (int) or 3 (long)");
8469 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
8470 if (!use64byteVector) {
8471 fill32_avx(dst, disp, xmm);
8472 subptr(length, 32 >> shift);
8473 fill32_masked_avx(shift, dst, disp + 32, xmm, mask, length, temp);
8474 } else {
8475 assert(MaxVectorSize == 64, "vector length != 64");
8476 movl(temp, 1);
8477 shlxl(temp, temp, length);
8478 subptr(temp, 1);
8479 kmovwl(mask, temp);
8480 evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_512bit);
8481 }
8482 }
8483
8484
8485 void MacroAssembler::fill32_masked_avx(uint shift, Register dst, int disp,
8486 XMMRegister xmm, KRegister mask, Register length,
8487 Register temp) {
8488 assert(MaxVectorSize >= 32, "vector length should be >= 32");
  assert(shift != 0, "shift value should be 1 (short), 2 (int) or 3 (long)");
8490 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
8491 movl(temp, 1);
8492 shlxl(temp, temp, length);
8493 subptr(temp, 1);
8494 kmovwl(mask, temp);
8495 evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_256bit);
8496 }
8497
8498 void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
8499 assert(MaxVectorSize >= 32, "vector length should be >= 32");
8500 vmovdqu(dst, xmm);
8501 }
8502
8503 void MacroAssembler::fill32_avx(Register dst, int disp, XMMRegister xmm) {
8504 fill32(Address(dst, disp), xmm);
8505 }
8506
8507 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
8508 assert(MaxVectorSize >= 32, "vector length should be >= 32");
8509 if (!use64byteVector) {
8510 fill32(dst, xmm);
8511 fill32(dst.plus_disp(32), xmm);
8512 } else {
8513 evmovdquq(dst, xmm, Assembler::AVX_512bit);
8514 }
8515 }
8516
8517 void MacroAssembler::fill64_avx(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
8518 fill64(Address(dst, disp), xmm, use64byteVector);
8519 }
8520
8521 #endif //COMPILER2_OR_JVMCI
8522
8523
8524 #ifdef _LP64
8525 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
8526 Label done;
8527 cvttss2sil(dst, src);
8528 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
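  // cvttss2si returns the "integer indefinite" value 0x80000000 for NaN and
  // out-of-range inputs, so only that result takes the slow path, which
  // re-examines the original float passed on the stack.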
8529 cmpl(dst, 0x80000000); // float_sign_flip
8530 jccb(Assembler::notEqual, done);
8531 subptr(rsp, 8);
8532 movflt(Address(rsp, 0), src);
8533 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
8534 pop(dst);
8535 bind(done);
8536 }
8537
8538 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
8539 Label done;
8540 cvttsd2sil(dst, src);
8541 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
8542 cmpl(dst, 0x80000000); // float_sign_flip
8543 jccb(Assembler::notEqual, done);
8544 subptr(rsp, 8);
8545 movdbl(Address(rsp, 0), src);
8546 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
8547 pop(dst);
8548 bind(done);
8549 }
8550
8551 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
8552 Label done;
8553 cvttss2siq(dst, src);
8554 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
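  // The 64-bit sentinel (0x8000000000000000, the long "integer indefinite"
  // value) does not fit in a cmp immediate, so the comparison goes through the
  // in-memory constant at double_sign_flip().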
8555 jccb(Assembler::notEqual, done);
8556 subptr(rsp, 8);
8557 movflt(Address(rsp, 0), src);
8558 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
8559 pop(dst);
8560 bind(done);
8561 }
8562
8563 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
8564 Label done;
8565 cvttsd2siq(dst, src);
8566 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
8567 jccb(Assembler::notEqual, done);
8568 subptr(rsp, 8);
8569 movdbl(Address(rsp, 0), src);
8570 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
8571 pop(dst);
8572 bind(done);
8573 }
8574
8575 void MacroAssembler::cache_wb(Address line)
8576 {
  // 64-bit CPUs always support clflush.
8578 assert(VM_Version::supports_clflush(), "clflush should be available");
8579 bool optimized = VM_Version::supports_clflushopt();
8580 bool no_evict = VM_Version::supports_clwb();
8581
  // Prefer clwb (writeback without evict); otherwise prefer clflushopt
  // (potentially parallel writeback with evict); otherwise fall back
  // on clflush (serial writeback with evict).
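  // clflushopt and clwb are only weakly ordered with respect to other flushes,
  // so cache_wbsync() below issues an sfence after the post-flush pass; plain
  // clflush is strongly ordered and needs no extra fence.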
8585
8586 if (optimized) {
8587 if (no_evict) {
8588 clwb(line);
8589 } else {
8590 clflushopt(line);
8591 }
8592 } else {
8593 // no need for fence when using CLFLUSH
8594 clflush(line);
8595 }
8596 }
8597
8598 void MacroAssembler::cache_wbsync(bool is_pre)
8599 {
8600 assert(VM_Version::supports_clflush(), "clflush should be available");
8601 bool optimized = VM_Version::supports_clflushopt();
8602 bool no_evict = VM_Version::supports_clwb();
8603
8604 // pick the correct implementation
8605
8606 if (!is_pre && (optimized || no_evict)) {
    // need an sfence for post flush when using clflushopt or clwb
    // otherwise no need for any synchronization
8609
8610 sfence();
8611 }
8612 }
8613
8614 #endif // _LP64
8615
8616 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
8617 switch (cond) {
8618 // Note some conditions are synonyms for others
8619 case Assembler::zero: return Assembler::notZero;
8620 case Assembler::notZero: return Assembler::zero;
8621 case Assembler::less: return Assembler::greaterEqual;
8622 case Assembler::lessEqual: return Assembler::greater;
8623 case Assembler::greater: return Assembler::lessEqual;
8624 case Assembler::greaterEqual: return Assembler::less;
8625 case Assembler::below: return Assembler::aboveEqual;
8626 case Assembler::belowEqual: return Assembler::above;
8627 case Assembler::above: return Assembler::belowEqual;
8628 case Assembler::aboveEqual: return Assembler::below;
8629 case Assembler::overflow: return Assembler::noOverflow;
8630 case Assembler::noOverflow: return Assembler::overflow;
8631 case Assembler::negative: return Assembler::positive;
8632 case Assembler::positive: return Assembler::negative;
8633 case Assembler::parity: return Assembler::noParity;
8634 case Assembler::noParity: return Assembler::parity;
8635 }
8636 ShouldNotReachHere(); return Assembler::overflow;
8637 }
8638
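// RAII helper: code emitted between construction and destruction of a
// SkipIfEqual is branched over when the byte flag at flag_addr equals value.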
8639 SkipIfEqual::SkipIfEqual(
8640 MacroAssembler* masm, const bool* flag_addr, bool value) {
8641 _masm = masm;
8642 _masm->cmp8(ExternalAddress((address)flag_addr), value);
8643 _masm->jcc(Assembler::equal, _label);
8644 }
8645
8646 SkipIfEqual::~SkipIfEqual() {
8647 _masm->bind(_label);
8648 }
8649
8650 // 32-bit Windows has its own fast-path implementation
8651 // of get_thread
8652 #if !defined(WIN32) || defined(_LP64)
8653
8654 // This is simply a call to Thread::current()
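// All caller-saved general-purpose registers except rax (which receives the
// result) are preserved around the call, so this can be used from arbitrary
// generated code without clobbering live values.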
8655 void MacroAssembler::get_thread(Register thread) {
8656 if (thread != rax) {
8657 push(rax);
8658 }
8659 LP64_ONLY(push(rdi);)
8660 LP64_ONLY(push(rsi);)
8661 push(rdx);
8662 push(rcx);
8663 #ifdef _LP64
8664 push(r8);
8665 push(r9);
8666 push(r10);
8667 push(r11);
8668 #endif
8669
8670 MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
8671
8672 #ifdef _LP64
8673 pop(r11);
8674 pop(r10);
8675 pop(r9);
8676 pop(r8);
8677 #endif
8678 pop(rcx);
8679 pop(rdx);
8680 LP64_ONLY(pop(rsi);)
8681 LP64_ONLY(pop(rdi);)
8682 if (thread != rax) {
8683 mov(thread, rax);
8684 pop(rax);
8685 }
8686 }
8687
8688 #endif // !WIN32 || _LP64