1 /*
2 * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "precompiled.hpp"
26 #include "jvm.h"
27 #include "asm/assembler.hpp"
28 #include "asm/assembler.inline.hpp"
29 #include "compiler/compiler_globals.hpp"
30 #include "compiler/disassembler.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/collectedHeap.inline.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/bytecodeHistogram.hpp"
36 #include "interpreter/interpreter.hpp"
37 #include "memory/resourceArea.hpp"
38 #include "memory/universe.hpp"
39 #include "oops/accessDecorators.hpp"
40 #include "oops/compressedOops.inline.hpp"
41 #include "oops/klass.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "runtime/biasedLocking.hpp"
44 #include "runtime/flags/flagSetting.hpp"
45 #include "runtime/interfaceSupport.inline.hpp"
46 #include "runtime/jniHandles.hpp"
47 #include "runtime/objectMonitor.hpp"
48 #include "runtime/os.hpp"
49 #include "runtime/safepoint.hpp"
50 #include "runtime/safepointMechanism.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubRoutines.hpp"
53 #include "runtime/thread.hpp"
54 #include "utilities/macros.hpp"
55 #include "crc32c.h"
56
57 #ifdef COMPILER2
58 #include "opto/c2_CodeStubs.hpp"
59 #include "opto/compile.hpp"
60 #include "opto/output.hpp"
61 #endif
62
63 #ifdef PRODUCT
64 #define BLOCK_COMMENT(str) /* nothing */
65 #define STOP(error) stop(error)
66 #else
67 #define BLOCK_COMMENT(str) block_comment(str)
68 #define STOP(error) block_comment(error); stop(error)
69 #endif
70
71 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
72
73 #ifdef ASSERT
74 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
75 #endif
76
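// Maps each Assembler::Condition encoding (0x0 .. 0xf) to its logical negation,
// so reverse[cc] is the condition that is taken exactly when cc is not.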
77 static Assembler::Condition reverse[] = {
78 Assembler::noOverflow /* overflow = 0x0 */ ,
79 Assembler::overflow /* noOverflow = 0x1 */ ,
80 Assembler::aboveEqual /* carrySet = 0x2, below = 0x2 */ ,
81 Assembler::below /* aboveEqual = 0x3, carryClear = 0x3 */ ,
82 Assembler::notZero /* zero = 0x4, equal = 0x4 */ ,
83 Assembler::zero /* notZero = 0x5, notEqual = 0x5 */ ,
84 Assembler::above /* belowEqual = 0x6 */ ,
85 Assembler::belowEqual /* above = 0x7 */ ,
86 Assembler::positive /* negative = 0x8 */ ,
87 Assembler::negative /* positive = 0x9 */ ,
88 Assembler::noParity /* parity = 0xa */ ,
89 Assembler::parity /* noParity = 0xb */ ,
90 Assembler::greaterEqual /* less = 0xc */ ,
91 Assembler::less /* greaterEqual = 0xd */ ,
92 Assembler::greater /* lessEqual = 0xe */ ,
  Assembler::lessEqual   /* greater      = 0xf */
};
96
97
98 // Implementation of MacroAssembler
99
// First, all the versions that have distinct implementations depending on 32/64 bit,
// unless the difference is trivial (one line or so).
102
103 #ifndef _LP64
104
105 // 32bit versions
106
107 Address MacroAssembler::as_Address(AddressLiteral adr) {
108 return Address(adr.target(), adr.rspec());
109 }
110
111 Address MacroAssembler::as_Address(ArrayAddress adr) {
112 return Address::make_array(adr);
113 }
114
115 void MacroAssembler::call_VM_leaf_base(address entry_point,
116 int number_of_arguments) {
117 call(RuntimeAddress(entry_point));
118 increment(rsp, number_of_arguments * wordSize);
119 }
120
121 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
122 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
123 }
124
125
126 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
127 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
128 }
129
130 void MacroAssembler::cmpoop(Address src1, jobject obj) {
131 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
132 }
133
134 void MacroAssembler::cmpoop(Register src1, jobject obj) {
135 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
136 }
137
138 void MacroAssembler::extend_sign(Register hi, Register lo) {
139 // According to Intel Doc. AP-526, "Integer Divide", p.18.
140 if (VM_Version::is_P6() && hi == rdx && lo == rax) {
141 cdql();
142 } else {
143 movl(hi, lo);
144 sarl(hi, 31);
145 }
146 }
147
148 void MacroAssembler::jC2(Register tmp, Label& L) {
149 // set parity bit if FPU flag C2 is set (via rax)
150 save_rax(tmp);
151 fwait(); fnstsw_ax();
152 sahf();
153 restore_rax(tmp);
154 // branch
155 jcc(Assembler::parity, L);
156 }
157
158 void MacroAssembler::jnC2(Register tmp, Label& L) {
159 // set parity bit if FPU flag C2 is set (via rax)
160 save_rax(tmp);
161 fwait(); fnstsw_ax();
162 sahf();
163 restore_rax(tmp);
164 // branch
165 jcc(Assembler::noParity, L);
166 }
167
168 // 32bit can do a case table jump in one instruction but we no longer allow the base
169 // to be installed in the Address class
170 void MacroAssembler::jump(ArrayAddress entry) {
171 jmp(as_Address(entry));
172 }
173
174 // Note: y_lo will be destroyed
175 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
176 // Long compare for Java (semantics as described in JVM spec.)
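  // Result is returned in x_hi as -1, 0 or 1, according to whether the 64-bit
  // value x_hi:x_lo is less than, equal to, or greater than y_hi:y_lo.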
177 Label high, low, done;
178
179 cmpl(x_hi, y_hi);
180 jcc(Assembler::less, low);
181 jcc(Assembler::greater, high);
182 // x_hi is the return register
183 xorl(x_hi, x_hi);
184 cmpl(x_lo, y_lo);
185 jcc(Assembler::below, low);
186 jcc(Assembler::equal, done);
187
188 bind(high);
189 xorl(x_hi, x_hi);
190 increment(x_hi);
191 jmp(done);
192
193 bind(low);
194 xorl(x_hi, x_hi);
195 decrementl(x_hi);
196
197 bind(done);
198 }
199
200 void MacroAssembler::lea(Register dst, AddressLiteral src) {
201 mov_literal32(dst, (int32_t)src.target(), src.rspec());
202 }
203
204 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
205 // leal(dst, as_Address(adr));
206 // see note in movl as to why we must use a move
207 mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
208 }
209
210 void MacroAssembler::leave() {
211 mov(rsp, rbp);
212 pop(rbp);
213 }
214
215 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
216 // Multiplication of two Java long values stored on the stack
217 // as illustrated below. Result is in rdx:rax.
218 //
219 // rsp ---> [ ?? ] \ \
220 // .... | y_rsp_offset |
221 // [ y_lo ] / (in bytes) | x_rsp_offset
222 // [ y_hi ] | (in bytes)
223 // .... |
224 // [ x_lo ] /
225 // [ x_hi ]
226 // ....
227 //
228 // Basic idea: lo(result) = lo(x_lo * y_lo)
229 // hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
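  // (Expanding (x_hi*2^32 + x_lo) * (y_hi*2^32 + y_lo), the x_hi*y_hi term lies
  //  entirely above bit 63 and is discarded, so only the three partial products
  //  above are needed; the "quick" path below applies when x_hi == 0 and y_hi == 0.)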
230 Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
231 Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
232 Label quick;
233 // load x_hi, y_hi and check if quick
234 // multiplication is possible
235 movl(rbx, x_hi);
236 movl(rcx, y_hi);
237 movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx = 0 do quick multiply
240 // do full multiplication
241 // 1st step
242 mull(y_lo); // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx
244 // 2nd step
245 movl(rax, x_lo);
246 mull(rcx); // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx
  // 3rd step
  bind(quick);                                   // note: rbx = 0 if quick multiply!
250 movl(rax, x_lo);
251 mull(y_lo); // x_lo * y_lo
252 addl(rdx, rbx); // correct hi(x_lo * y_lo)
253 }
254
255 void MacroAssembler::lneg(Register hi, Register lo) {
256 negl(lo);
257 adcl(hi, 0);
258 negl(hi);
259 }
260
261 void MacroAssembler::lshl(Register hi, Register lo) {
262 // Java shift left long support (semantics as described in JVM spec., p.305)
263 // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
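  // (Example: for s = 40 the s >= n path is taken: hi := lo, lo := 0, and the
  //  final shld/shl then shift by 40 & 0x1f = 8, since the hardware masks the
  //  shift count in cl to 5 bits for 32-bit operands.)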
264 // shift value is in rcx !
265 assert(hi != rcx, "must not use rcx");
266 assert(lo != rcx, "must not use rcx");
267 const Register s = rcx; // shift count
268 const int n = BitsPerWord;
269 Label L;
270 andl(s, 0x3f); // s := s & 0x3f (s < 0x40)
271 cmpl(s, n); // if (s < n)
272 jcc(Assembler::less, L); // else (s >= n)
273 movl(hi, lo); // x := x << n
274 xorl(lo, lo);
275 // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
276 bind(L); // s (mod n) < n
277 shldl(hi, lo); // x := x << s
278 shll(lo);
279 }
280
281
282 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
283 // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
284 // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
285 assert(hi != rcx, "must not use rcx");
286 assert(lo != rcx, "must not use rcx");
287 const Register s = rcx; // shift count
288 const int n = BitsPerWord;
289 Label L;
290 andl(s, 0x3f); // s := s & 0x3f (s < 0x40)
291 cmpl(s, n); // if (s < n)
292 jcc(Assembler::less, L); // else (s >= n)
293 movl(lo, hi); // x := x >> n
294 if (sign_extension) sarl(hi, 31);
295 else xorl(hi, hi);
296 // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
297 bind(L); // s (mod n) < n
298 shrdl(lo, hi); // x := x >> s
299 if (sign_extension) sarl(hi);
300 else shrl(hi);
301 }
302
303 void MacroAssembler::movoop(Register dst, jobject obj) {
304 mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
305 }
306
307 void MacroAssembler::movoop(Address dst, jobject obj) {
308 mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
309 }
310
311 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
312 mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
313 }
314
315 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
316 mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
317 }
318
319 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
320 // scratch register is not used,
321 // it is defined to match parameters of 64-bit version of this method.
322 if (src.is_lval()) {
323 mov_literal32(dst, (intptr_t)src.target(), src.rspec());
324 } else {
325 movl(dst, as_Address(src));
326 }
327 }
328
329 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
330 movl(as_Address(dst), src);
331 }
332
333 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
334 movl(dst, as_Address(src));
335 }
336
337 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
338 void MacroAssembler::movptr(Address dst, intptr_t src) {
339 movl(dst, src);
340 }
341
342
343 void MacroAssembler::pop_callee_saved_registers() {
344 pop(rcx);
345 pop(rdx);
346 pop(rdi);
347 pop(rsi);
348 }
349
350 void MacroAssembler::push_callee_saved_registers() {
351 push(rsi);
352 push(rdi);
353 push(rdx);
354 push(rcx);
355 }
356
357 void MacroAssembler::pushoop(jobject obj) {
358 push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
359 }
360
361 void MacroAssembler::pushklass(Metadata* obj) {
362 push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
363 }
364
365 void MacroAssembler::pushptr(AddressLiteral src) {
366 if (src.is_lval()) {
367 push_literal32((int32_t)src.target(), src.rspec());
368 } else {
369 pushl(as_Address(src));
370 }
371 }
372
373 static void pass_arg0(MacroAssembler* masm, Register arg) {
374 masm->push(arg);
375 }
376
377 static void pass_arg1(MacroAssembler* masm, Register arg) {
378 masm->push(arg);
379 }
380
381 static void pass_arg2(MacroAssembler* masm, Register arg) {
382 masm->push(arg);
383 }
384
385 static void pass_arg3(MacroAssembler* masm, Register arg) {
386 masm->push(arg);
387 }
388
389 #ifndef PRODUCT
390 extern "C" void findpc(intptr_t x);
391 #endif
392
393 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
  // In order to get locks to work, we need to fake an in_VM state
  JavaThread* thread = JavaThread::current();
  JavaThreadState saved_state = thread->thread_state();
  thread->set_thread_state(_thread_in_vm);
  if (ShowMessageBoxOnError) {
402 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
403 ttyLocker ttyl;
404 BytecodeCounter::print();
405 }
406 // To see where a verify_oop failed, get $ebx+40/X for this frame.
407 // This is the value of eip which points to where verify_oop will return.
408 if (os::message_box(msg, "Execution stopped, print registers?")) {
409 print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
410 BREAKPOINT;
411 }
412 }
413 fatal("DEBUG MESSAGE: %s", msg);
414 }
415
416 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
417 ttyLocker ttyl;
418 FlagSetting fs(Debugging, true);
419 tty->print_cr("eip = 0x%08x", eip);
420 #ifndef PRODUCT
421 if ((WizardMode || Verbose) && PrintMiscellaneous) {
422 tty->cr();
423 findpc(eip);
424 tty->cr();
425 }
426 #endif
427 #define PRINT_REG(rax) \
428 { tty->print("%s = ", #rax); os::print_location(tty, rax); }
429 PRINT_REG(rax);
430 PRINT_REG(rbx);
431 PRINT_REG(rcx);
432 PRINT_REG(rdx);
433 PRINT_REG(rdi);
434 PRINT_REG(rsi);
435 PRINT_REG(rbp);
436 PRINT_REG(rsp);
437 #undef PRINT_REG
  // Print some words near the top of the stack.
439 int* dump_sp = (int*) rsp;
440 for (int col1 = 0; col1 < 8; col1++) {
441 tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
442 os::print_location(tty, *dump_sp++);
443 }
444 for (int row = 0; row < 16; row++) {
445 tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
446 for (int col = 0; col < 8; col++) {
447 tty->print(" 0x%08x", *dump_sp++);
448 }
449 tty->cr();
450 }
451 // Print some instructions around pc:
452 Disassembler::decode((address)eip-64, (address)eip);
453 tty->print_cr("--------");
454 Disassembler::decode((address)eip, (address)eip+32);
455 }
456
457 void MacroAssembler::stop(const char* msg) {
458 ExternalAddress message((address)msg);
459 // push address of message
460 pushptr(message.addr());
461 { Label L; call(L, relocInfo::none); bind(L); } // push eip
462 pusha(); // push registers
463 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
464 hlt();
465 }
466
467 void MacroAssembler::warn(const char* msg) {
468 push_CPU_state();
469
470 ExternalAddress message((address) msg);
471 // push address of message
472 pushptr(message.addr());
473
474 call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
475 addl(rsp, wordSize); // discard argument
476 pop_CPU_state();
477 }
478
479 void MacroAssembler::print_state() {
480 { Label L; call(L, relocInfo::none); bind(L); } // push eip
481 pusha(); // push registers
482
483 push_CPU_state();
484 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
485 pop_CPU_state();
486
487 popa();
488 addl(rsp, wordSize);
489 }
490
491 #else // _LP64
492
493 // 64 bit versions
494
495 Address MacroAssembler::as_Address(AddressLiteral adr) {
496 // amd64 always does this as a pc-rel
497 // we can be absolute or disp based on the instruction type
498 // jmp/call are displacements others are absolute
499 assert(!adr.is_lval(), "must be rval");
500 assert(reachable(adr), "must be");
501 return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
502
503 }
504
505 Address MacroAssembler::as_Address(ArrayAddress adr) {
506 AddressLiteral base = adr.base();
507 lea(rscratch1, base);
508 Address index = adr.index();
509 assert(index._disp == 0, "must not have disp"); // maybe it can?
510 Address array(rscratch1, index._index, index._scale, index._disp);
511 return array;
512 }
513
514 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
515 Label L, E;
516
517 #ifdef _WIN64
  // Windows always allocates space for its register args
519 assert(num_args <= 4, "only register arguments supported");
520 subq(rsp, frame::arg_reg_save_area_bytes);
521 #endif
522
523 // Align stack if necessary
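  // (The ABI requires rsp to be 16-byte aligned at the call; rsp is assumed to
  //  stay at least 8-byte aligned here, so a single subq(rsp, 8) below is enough
  //  to fix up the only possible misalignment.)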
524 testl(rsp, 15);
525 jcc(Assembler::zero, L);
526
527 subq(rsp, 8);
528 {
529 call(RuntimeAddress(entry_point));
530 }
531 addq(rsp, 8);
532 jmp(E);
533
534 bind(L);
535 {
536 call(RuntimeAddress(entry_point));
537 }
538
539 bind(E);
540
541 #ifdef _WIN64
542 // restore stack pointer
543 addq(rsp, frame::arg_reg_save_area_bytes);
544 #endif
545
546 }
547
548 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
549 assert(!src2.is_lval(), "should use cmpptr");
550
551 if (reachable(src2)) {
552 cmpq(src1, as_Address(src2));
553 } else {
554 lea(rscratch1, src2);
555 Assembler::cmpq(src1, Address(rscratch1, 0));
556 }
557 }
558
559 int MacroAssembler::corrected_idivq(Register reg) {
560 // Full implementation of Java ldiv and lrem; checks for special
561 // case as described in JVM spec., p.243 & p.271. The function
562 // returns the (pc) offset of the idivl instruction - may be needed
563 // for implicit exceptions.
564 //
565 // normal case special case
566 //
567 // input : rax: dividend min_long
568 // reg: divisor (may not be eax/edx) -1
569 //
570 // output: rax: quotient (= rax idiv reg) min_long
571 // rdx: remainder (= rax irem reg) 0
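  // (The special case must be filtered out because idivq raises a divide-error
  //  fault for min_long / -1: the mathematically correct quotient 2^63 is not
  //  representable in a signed 64-bit register.)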
572 assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
573 static const int64_t min_long = 0x8000000000000000;
574 Label normal_case, special_case;
575
576 // check for special case
577 cmp64(rax, ExternalAddress((address) &min_long));
578 jcc(Assembler::notEqual, normal_case);
579 xorl(rdx, rdx); // prepare rdx for possible special case (where
580 // remainder = 0)
581 cmpq(reg, -1);
582 jcc(Assembler::equal, special_case);
583
584 // handle normal case
585 bind(normal_case);
586 cdqq();
587 int idivq_offset = offset();
588 idivq(reg);
589
590 // normal and special case exit
591 bind(special_case);
592
593 return idivq_offset;
594 }
595
596 void MacroAssembler::decrementq(Register reg, int value) {
597 if (value == min_jint) { subq(reg, value); return; }
598 if (value < 0) { incrementq(reg, -value); return; }
599 if (value == 0) { ; return; }
600 if (value == 1 && UseIncDec) { decq(reg) ; return; }
601 /* else */ { subq(reg, value) ; return; }
602 }
603
604 void MacroAssembler::decrementq(Address dst, int value) {
605 if (value == min_jint) { subq(dst, value); return; }
606 if (value < 0) { incrementq(dst, -value); return; }
607 if (value == 0) { ; return; }
608 if (value == 1 && UseIncDec) { decq(dst) ; return; }
609 /* else */ { subq(dst, value) ; return; }
610 }
611
612 void MacroAssembler::incrementq(AddressLiteral dst) {
613 if (reachable(dst)) {
614 incrementq(as_Address(dst));
615 } else {
616 lea(rscratch1, dst);
617 incrementq(Address(rscratch1, 0));
618 }
619 }
620
621 void MacroAssembler::incrementq(Register reg, int value) {
622 if (value == min_jint) { addq(reg, value); return; }
623 if (value < 0) { decrementq(reg, -value); return; }
624 if (value == 0) { ; return; }
625 if (value == 1 && UseIncDec) { incq(reg) ; return; }
626 /* else */ { addq(reg, value) ; return; }
627 }
628
629 void MacroAssembler::incrementq(Address dst, int value) {
630 if (value == min_jint) { addq(dst, value); return; }
631 if (value < 0) { decrementq(dst, -value); return; }
632 if (value == 0) { ; return; }
633 if (value == 1 && UseIncDec) { incq(dst) ; return; }
634 /* else */ { addq(dst, value) ; return; }
635 }
636
637 // 32bit can do a case table jump in one instruction but we no longer allow the base
638 // to be installed in the Address class
639 void MacroAssembler::jump(ArrayAddress entry) {
640 lea(rscratch1, entry.base());
641 Address dispatch = entry.index();
642 assert(dispatch._base == noreg, "must be");
643 dispatch._base = rscratch1;
644 jmp(dispatch);
645 }
646
647 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
648 ShouldNotReachHere(); // 64bit doesn't use two regs
649 cmpq(x_lo, y_lo);
650 }
651
652 void MacroAssembler::lea(Register dst, AddressLiteral src) {
653 mov_literal64(dst, (intptr_t)src.target(), src.rspec());
654 }
655
656 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
657 mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
658 movptr(dst, rscratch1);
659 }
660
661 void MacroAssembler::leave() {
662 // %%% is this really better? Why not on 32bit too?
663 emit_int8((unsigned char)0xC9); // LEAVE
664 }
665
666 void MacroAssembler::lneg(Register hi, Register lo) {
667 ShouldNotReachHere(); // 64bit doesn't use two regs
668 negq(lo);
669 }
670
671 void MacroAssembler::movoop(Register dst, jobject obj) {
672 mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
673 }
674
675 void MacroAssembler::movoop(Address dst, jobject obj) {
676 mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
677 movq(dst, rscratch1);
678 }
679
680 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
681 mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
682 }
683
684 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
685 mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
686 movq(dst, rscratch1);
687 }
688
689 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
690 if (src.is_lval()) {
691 mov_literal64(dst, (intptr_t)src.target(), src.rspec());
692 } else {
693 if (reachable(src)) {
694 movq(dst, as_Address(src));
695 } else {
696 lea(scratch, src);
697 movq(dst, Address(scratch, 0));
698 }
699 }
700 }
701
702 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
703 movq(as_Address(dst), src);
704 }
705
706 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
707 movq(dst, as_Address(src));
708 }
709
710 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
711 void MacroAssembler::movptr(Address dst, intptr_t src) {
712 if (is_simm32(src)) {
713 movptr(dst, checked_cast<int32_t>(src));
714 } else {
715 mov64(rscratch1, src);
716 movq(dst, rscratch1);
717 }
718 }
719
720 // These are mostly for initializing NULL
721 void MacroAssembler::movptr(Address dst, int32_t src) {
722 movslq(dst, src);
723 }
724
725 void MacroAssembler::movptr(Register dst, int32_t src) {
726 mov64(dst, (intptr_t)src);
727 }
728
729 void MacroAssembler::pushoop(jobject obj) {
730 movoop(rscratch1, obj);
731 push(rscratch1);
732 }
733
734 void MacroAssembler::pushklass(Metadata* obj) {
735 mov_metadata(rscratch1, obj);
736 push(rscratch1);
737 }
738
739 void MacroAssembler::pushptr(AddressLiteral src) {
740 lea(rscratch1, src);
741 if (src.is_lval()) {
742 push(rscratch1);
743 } else {
744 pushq(Address(rscratch1, 0));
745 }
746 }
747
748 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
749 reset_last_Java_frame(r15_thread, clear_fp);
750 }
751
752 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
753 Register last_java_fp,
754 address last_java_pc) {
755 vzeroupper();
756 // determine last_java_sp register
757 if (!last_java_sp->is_valid()) {
758 last_java_sp = rsp;
759 }
760
761 // last_java_fp is optional
762 if (last_java_fp->is_valid()) {
763 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
764 last_java_fp);
765 }
766
767 // last_java_pc is optional
768 if (last_java_pc != NULL) {
769 Address java_pc(r15_thread,
770 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
771 lea(rscratch1, InternalAddress(last_java_pc));
772 movptr(java_pc, rscratch1);
773 }
774
775 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
776 }
777
778 static void pass_arg0(MacroAssembler* masm, Register arg) {
779 if (c_rarg0 != arg ) {
780 masm->mov(c_rarg0, arg);
781 }
782 }
783
784 static void pass_arg1(MacroAssembler* masm, Register arg) {
785 if (c_rarg1 != arg ) {
786 masm->mov(c_rarg1, arg);
787 }
788 }
789
790 static void pass_arg2(MacroAssembler* masm, Register arg) {
791 if (c_rarg2 != arg ) {
792 masm->mov(c_rarg2, arg);
793 }
794 }
795
796 static void pass_arg3(MacroAssembler* masm, Register arg) {
797 if (c_rarg3 != arg ) {
798 masm->mov(c_rarg3, arg);
799 }
800 }
801
802 void MacroAssembler::stop(const char* msg) {
803 if (ShowMessageBoxOnError) {
804 address rip = pc();
805 pusha(); // get regs on stack
806 lea(c_rarg1, InternalAddress(rip));
807 movq(c_rarg2, rsp); // pass pointer to regs array
808 }
809 lea(c_rarg0, ExternalAddress((address) msg));
810 andq(rsp, -16); // align stack as required by ABI
811 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
812 hlt();
813 }
814
815 void MacroAssembler::warn(const char* msg) {
816 push(rbp);
817 movq(rbp, rsp);
818 andq(rsp, -16); // align stack as required by push_CPU_state and call
819 push_CPU_state(); // keeps alignment at 16 bytes
820 lea(c_rarg0, ExternalAddress((address) msg));
821 lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
822 call(rax);
823 pop_CPU_state();
824 mov(rsp, rbp);
825 pop(rbp);
826 }
827
828 void MacroAssembler::print_state() {
829 address rip = pc();
830 pusha(); // get regs on stack
831 push(rbp);
832 movq(rbp, rsp);
833 andq(rsp, -16); // align stack as required by push_CPU_state and call
834 push_CPU_state(); // keeps alignment at 16 bytes
835
836 lea(c_rarg0, InternalAddress(rip));
837 lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
838 call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
839
840 pop_CPU_state();
841 mov(rsp, rbp);
842 pop(rbp);
843 popa();
844 }
845
846 #ifndef PRODUCT
847 extern "C" void findpc(intptr_t x);
848 #endif
849
850 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
852 if (ShowMessageBoxOnError) {
853 JavaThread* thread = JavaThread::current();
854 JavaThreadState saved_state = thread->thread_state();
855 thread->set_thread_state(_thread_in_vm);
856 #ifndef PRODUCT
857 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
858 ttyLocker ttyl;
859 BytecodeCounter::print();
860 }
861 #endif
862 // To see where a verify_oop failed, get $ebx+40/X for this frame.
863 // XXX correct this offset for amd64
864 // This is the value of eip which points to where verify_oop will return.
865 if (os::message_box(msg, "Execution stopped, print registers?")) {
866 print_state64(pc, regs);
867 BREAKPOINT;
868 }
869 }
870 fatal("DEBUG MESSAGE: %s", msg);
871 }
872
873 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
874 ttyLocker ttyl;
875 FlagSetting fs(Debugging, true);
876 tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
877 #ifndef PRODUCT
878 tty->cr();
879 findpc(pc);
880 tty->cr();
881 #endif
882 #define PRINT_REG(rax, value) \
883 { tty->print("%s = ", #rax); os::print_location(tty, value); }
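  // regs[] is the register block pushed by pusha() in stop()/print_state();
  // per the index mappings below, rax was pushed first (ending up at regs[15])
  // and r15 last (ending up at regs[0]).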
884 PRINT_REG(rax, regs[15]);
885 PRINT_REG(rbx, regs[12]);
886 PRINT_REG(rcx, regs[14]);
887 PRINT_REG(rdx, regs[13]);
888 PRINT_REG(rdi, regs[8]);
889 PRINT_REG(rsi, regs[9]);
890 PRINT_REG(rbp, regs[10]);
891 // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
  PRINT_REG(rsp, (intptr_t)(&regs[16]));
893 PRINT_REG(r8 , regs[7]);
894 PRINT_REG(r9 , regs[6]);
895 PRINT_REG(r10, regs[5]);
896 PRINT_REG(r11, regs[4]);
897 PRINT_REG(r12, regs[3]);
898 PRINT_REG(r13, regs[2]);
899 PRINT_REG(r14, regs[1]);
900 PRINT_REG(r15, regs[0]);
901 #undef PRINT_REG
902 // Print some words near the top of the stack.
  int64_t* rsp = &regs[16];
904 int64_t* dump_sp = rsp;
905 for (int col1 = 0; col1 < 8; col1++) {
906 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
907 os::print_location(tty, *dump_sp++);
908 }
909 for (int row = 0; row < 25; row++) {
910 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
911 for (int col = 0; col < 4; col++) {
912 tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
913 }
914 tty->cr();
915 }
916 // Print some instructions around pc:
917 Disassembler::decode((address)pc-64, (address)pc);
918 tty->print_cr("--------");
919 Disassembler::decode((address)pc, (address)pc+32);
920 }
921
922 // The java_calling_convention describes stack locations as ideal slots on
923 // a frame with no abi restrictions. Since we must observe abi restrictions
924 // (like the placement of the register window) the slots must be biased by
925 // the following value.
926 static int reg2offset_in(VMReg r) {
927 // Account for saved rbp and return address
928 // This should really be in_preserve_stack_slots
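  // (With 4-byte VMReg stack slots, the "+ 4" skips the 16 bytes occupied by
  //  the saved rbp and the return address, so incoming slot 0 maps to [rbp + 16].)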
929 return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
930 }
931
932 static int reg2offset_out(VMReg r) {
933 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
934 }
935
936 // A long move
937 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst) {
938
  // The calling convention assures us that each VMRegPair is either
  // entirely one physical register or a pair of adjacent stack slots.
941
942 if (src.is_single_phys_reg() ) {
943 if (dst.is_single_phys_reg()) {
944 if (dst.first() != src.first()) {
945 mov(dst.first()->as_Register(), src.first()->as_Register());
946 }
947 } else {
948 assert(dst.is_single_reg(), "not a stack pair");
949 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
950 }
951 } else if (dst.is_single_phys_reg()) {
952 assert(src.is_single_reg(), "not a stack pair");
953 movq(dst.first()->as_Register(), Address(rbp, reg2offset_out(src.first())));
954 } else {
955 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
956 movq(rax, Address(rbp, reg2offset_in(src.first())));
957 movq(Address(rsp, reg2offset_out(dst.first())), rax);
958 }
959 }
960
961 // A double move
962 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst) {
963
  // The calling convention assures us that each VMRegPair is either
  // entirely one physical register or a pair of adjacent stack slots.
966
967 if (src.is_single_phys_reg() ) {
968 if (dst.is_single_phys_reg()) {
969 // In theory these overlap but the ordering is such that this is likely a nop
970 if ( src.first() != dst.first()) {
971 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
972 }
973 } else {
974 assert(dst.is_single_reg(), "not a stack pair");
975 movdbl(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
976 }
977 } else if (dst.is_single_phys_reg()) {
978 assert(src.is_single_reg(), "not a stack pair");
979 movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_out(src.first())));
980 } else {
981 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
982 movq(rax, Address(rbp, reg2offset_in(src.first())));
983 movq(Address(rsp, reg2offset_out(dst.first())), rax);
984 }
985 }
986
987
// A float arg may have to do float reg to int reg conversion
989 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst) {
990 assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
991
  // The calling convention assures us that each VMRegPair is either
  // entirely one physical register or a pair of adjacent stack slots.
994
995 if (src.first()->is_stack()) {
996 if (dst.first()->is_stack()) {
997 movl(rax, Address(rbp, reg2offset_in(src.first())));
998 movptr(Address(rsp, reg2offset_out(dst.first())), rax);
999 } else {
1000 // stack to reg
1001 assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
1002 movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first())));
1003 }
1004 } else if (dst.first()->is_stack()) {
1005 // reg to stack
1006 assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
1007 movflt(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
1008 } else {
1009 // reg to reg
1010 // In theory these overlap but the ordering is such that this is likely a nop
1011 if ( src.first() != dst.first()) {
1012 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
1013 }
1014 }
1015 }
1016
// On 64 bit we will store integer-like items to the stack as 64-bit items
// (x86_32/64 ABI), even though Java would only store 32 bits for a parameter.
// On 32 bit it will simply be 32 bits. So this routine will do 32->32 on 32 bit
// and 32->64 on 64 bit.
1021 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst) {
1022 if (src.first()->is_stack()) {
1023 if (dst.first()->is_stack()) {
1024 // stack to stack
1025 movslq(rax, Address(rbp, reg2offset_in(src.first())));
1026 movq(Address(rsp, reg2offset_out(dst.first())), rax);
1027 } else {
1028 // stack to reg
1029 movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1030 }
1031 } else if (dst.first()->is_stack()) {
1032 // reg to stack
1033 // Do we really have to sign extend???
1034 // __ movslq(src.first()->as_Register(), src.first()->as_Register());
1035 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1036 } else {
1037 // Do we really have to sign extend???
1038 // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
1039 if (dst.first() != src.first()) {
1040 movq(dst.first()->as_Register(), src.first()->as_Register());
1041 }
1042 }
1043 }
1044
1045 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
1046 if (src.first()->is_stack()) {
1047 if (dst.first()->is_stack()) {
1048 // stack to stack
1049 movq(rax, Address(rbp, reg2offset_in(src.first())));
1050 movq(Address(rsp, reg2offset_out(dst.first())), rax);
1051 } else {
1052 // stack to reg
1053 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1054 }
1055 } else if (dst.first()->is_stack()) {
1056 // reg to stack
1057 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1058 } else {
1059 if (dst.first() != src.first()) {
1060 movq(dst.first()->as_Register(), src.first()->as_Register());
1061 }
1062 }
1063 }
1064
1065 // An oop arg. Must pass a handle not the oop itself
1066 void MacroAssembler::object_move(OopMap* map,
1067 int oop_handle_offset,
1068 int framesize_in_slots,
1069 VMRegPair src,
1070 VMRegPair dst,
1071 bool is_receiver,
1072 int* receiver_offset) {
1073
1074 // must pass a handle. First figure out the location we use as a handle
1075
1076 Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1077
  // See if the oop is NULL; if it is we need no handle
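  // A handle here is simply the address of the location holding the oop; the
  // cmov sequences below substitute NULL for the handle when the oop itself is
  // NULL, so the callee receives either a valid handle or NULL.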
1079
1080 if (src.first()->is_stack()) {
1081
1082 // Oop is already on the stack as an argument
1083 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1084 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1085 if (is_receiver) {
1086 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1087 }
1088
1089 cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
1090 lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1091 // conditionally move a NULL
1092 cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1093 } else {
1094
    // Oop is in a register; we must store it to the space we reserve
    // on the stack for oop_handles and pass a handle if the oop is non-NULL
1097
1098 const Register rOop = src.first()->as_Register();
1099 int oop_slot;
1100 if (rOop == j_rarg0)
1101 oop_slot = 0;
1102 else if (rOop == j_rarg1)
1103 oop_slot = 1;
1104 else if (rOop == j_rarg2)
1105 oop_slot = 2;
1106 else if (rOop == j_rarg3)
1107 oop_slot = 3;
1108 else if (rOop == j_rarg4)
1109 oop_slot = 4;
1110 else {
1111 assert(rOop == j_rarg5, "wrong register");
1112 oop_slot = 5;
1113 }
1114
1115 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1116 int offset = oop_slot*VMRegImpl::stack_slot_size;
1117
1118 map->set_oop(VMRegImpl::stack2reg(oop_slot));
1119 // Store oop in handle area, may be NULL
1120 movptr(Address(rsp, offset), rOop);
1121 if (is_receiver) {
1122 *receiver_offset = offset;
1123 }
1124
1125 cmpptr(rOop, (int32_t)NULL_WORD);
1126 lea(rHandle, Address(rsp, offset));
1127 // conditionally move a NULL from the handle area where it was just stored
1128 cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1129 }
1130
  // If arg is on the stack then place it, otherwise it is already in the correct reg.
1132 if (dst.first()->is_stack()) {
1133 movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1134 }
1135 }
1136
1137 #endif // _LP64
1138
1139 // Now versions that are common to 32/64 bit
1140
1141 void MacroAssembler::addptr(Register dst, int32_t imm32) {
1142 LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
1143 }
1144
1145 void MacroAssembler::addptr(Register dst, Register src) {
1146 LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1147 }
1148
1149 void MacroAssembler::addptr(Address dst, Register src) {
1150 LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1151 }
1152
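// The AddressLiteral overloads below share a common pattern: when the target is
// reachable (always the case on 32-bit; within RIP-relative 32-bit displacement
// range on x86_64) the operand is used directly, otherwise the address is first
// materialized into a scratch register (rscratch1 unless one is supplied).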
1153 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
1154 if (reachable(src)) {
1155 Assembler::addsd(dst, as_Address(src));
1156 } else {
1157 lea(rscratch1, src);
1158 Assembler::addsd(dst, Address(rscratch1, 0));
1159 }
1160 }
1161
1162 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
1163 if (reachable(src)) {
1164 addss(dst, as_Address(src));
1165 } else {
1166 lea(rscratch1, src);
1167 addss(dst, Address(rscratch1, 0));
1168 }
1169 }
1170
1171 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
1172 if (reachable(src)) {
1173 Assembler::addpd(dst, as_Address(src));
1174 } else {
1175 lea(rscratch1, src);
1176 Assembler::addpd(dst, Address(rscratch1, 0));
1177 }
1178 }
1179
1180 // See 8273459. Function for ensuring 64-byte alignment, intended for stubs only.
1181 // Stub code is generated once and never copied.
1182 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
1183 void MacroAssembler::align64() {
1184 align(64, (unsigned long long) pc());
1185 }
1186
1187 void MacroAssembler::align32() {
1188 align(32, (unsigned long long) pc());
1189 }
1190
1191 void MacroAssembler::align(int modulus) {
1192 // 8273459: Ensure alignment is possible with current segment alignment
1193 assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
1194 align(modulus, offset());
1195 }
1196
1197 void MacroAssembler::align(int modulus, int target) {
1198 if (target % modulus != 0) {
1199 nop(modulus - (target % modulus));
1200 }
1201 }
1202
1203 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1204 // Used in sign-masking with aligned address.
1205 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1206 if (reachable(src)) {
1207 Assembler::andpd(dst, as_Address(src));
1208 } else {
1209 lea(scratch_reg, src);
1210 Assembler::andpd(dst, Address(scratch_reg, 0));
1211 }
1212 }
1213
1214 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1215 // Used in sign-masking with aligned address.
1216 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1217 if (reachable(src)) {
1218 Assembler::andps(dst, as_Address(src));
1219 } else {
1220 lea(scratch_reg, src);
1221 Assembler::andps(dst, Address(scratch_reg, 0));
1222 }
1223 }
1224
1225 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1226 LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1227 }
1228
1229 void MacroAssembler::atomic_incl(Address counter_addr) {
1230 lock();
1231 incrementl(counter_addr);
1232 }
1233
1234 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1235 if (reachable(counter_addr)) {
1236 atomic_incl(as_Address(counter_addr));
1237 } else {
1238 lea(scr, counter_addr);
1239 atomic_incl(Address(scr, 0));
1240 }
1241 }
1242
1243 #ifdef _LP64
1244 void MacroAssembler::atomic_incq(Address counter_addr) {
1245 lock();
1246 incrementq(counter_addr);
1247 }
1248
1249 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1250 if (reachable(counter_addr)) {
1251 atomic_incq(as_Address(counter_addr));
1252 } else {
1253 lea(scr, counter_addr);
1254 atomic_incq(Address(scr, 0));
1255 }
1256 }
1257 #endif
1258
// Writes to successive stack pages until the given offset is reached, to check
// for stack overflow + shadow pages. This clobbers tmp.
1261 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1262 movptr(tmp, rsp);
1263 // Bang stack for total size given plus shadow page size.
1264 // Bang one page at a time because large size can bang beyond yellow and
1265 // red zones.
1266 Label loop;
1267 bind(loop);
1268 movl(Address(tmp, (-os::vm_page_size())), size );
1269 subptr(tmp, os::vm_page_size());
1270 subl(size, os::vm_page_size());
1271 jcc(Assembler::greater, loop);
1272
1273 // Bang down shadow pages too.
1274 // At this point, (tmp-0) is the last address touched, so don't
1275 // touch it again. (It was touched as (tmp-pagesize) but then tmp
1276 // was post-decremented.) Skip this address by starting at i=1, and
1277 // touch a few more pages below. N.B. It is important to touch all
1278 // the way down including all pages in the shadow zone.
1279 for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / os::vm_page_size()); i++) {
    // This could be any sized move, but since it can serve as a debugging crumb
    // the bigger the better.
1282 movptr(Address(tmp, (-i*os::vm_page_size())), size );
1283 }
1284 }
1285
1286 void MacroAssembler::reserved_stack_check() {
1287 // testing if reserved zone needs to be enabled
1288 Label no_reserved_zone_enabling;
1289 Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1290 NOT_LP64(get_thread(rsi);)
1291
1292 cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1293 jcc(Assembler::below, no_reserved_zone_enabling);
1294
1295 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1296 jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1297 should_not_reach_here();
1298
1299 bind(no_reserved_zone_enabling);
1300 }
1301
1302 void MacroAssembler::biased_locking_enter(Register lock_reg,
1303 Register obj_reg,
1304 Register swap_reg,
1305 Register tmp_reg,
1306 Register tmp_reg2,
1307 bool swap_reg_contains_mark,
1308 Label& done,
1309 Label* slow_case,
1310 BiasedLockingCounters* counters) {
1311 assert(UseBiasedLocking, "why call this otherwise?");
1312 assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1313 assert(tmp_reg != noreg, "tmp_reg must be supplied");
1314 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1315 assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
1316 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes());
1317 NOT_LP64( Address saved_mark_addr(lock_reg, 0); )
1318
1319 if (PrintBiasedLockingStatistics && counters == NULL) {
1320 counters = BiasedLocking::counters();
1321 }
1322 // Biased locking
1323 // See whether the lock is currently biased toward our thread and
1324 // whether the epoch is still valid
1325 // Note that the runtime guarantees sufficient alignment of JavaThread
1326 // pointers to allow age to be placed into low bits
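  // Biased mark word layout on 64-bit (see markWord.hpp):
  //   [JavaThread*:54 | epoch:2 | unused:1 | age:4 | biased_lock:1 | lock:2]
  // The low three bits equal markWord::biased_lock_pattern (0b101) while the
  // object is biasable or biased, which is what the check below tests.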
1327 // First check to see whether biasing is even enabled for this object
1328 Label cas_label;
1329 if (!swap_reg_contains_mark) {
1330 movptr(swap_reg, mark_addr);
1331 }
1332 movptr(tmp_reg, swap_reg);
1333 andptr(tmp_reg, markWord::biased_lock_mask_in_place);
1334 cmpptr(tmp_reg, markWord::biased_lock_pattern);
1335 jcc(Assembler::notEqual, cas_label);
1336 // The bias pattern is present in the object's header. Need to check
1337 // whether the bias owner and the epoch are both still current.
1338 #ifndef _LP64
1339 // Note that because there is no current thread register on x86_32 we
1340 // need to store off the mark word we read out of the object to
1341 // avoid reloading it and needing to recheck invariants below. This
1342 // store is unfortunate but it makes the overall code shorter and
1343 // simpler.
1344 movptr(saved_mark_addr, swap_reg);
1345 #endif
1346 load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
1347 #ifdef _LP64
1348 orptr(tmp_reg, r15_thread);
1349 xorptr(tmp_reg, swap_reg);
1350 Register header_reg = tmp_reg;
1351 #else
1352 xorptr(tmp_reg, swap_reg);
1353 get_thread(swap_reg);
1354 xorptr(swap_reg, tmp_reg);
1355 Register header_reg = swap_reg;
1356 #endif
1357 andptr(header_reg, ~((int) markWord::age_mask_in_place));
1358 if (counters != NULL) {
1359 cond_inc32(Assembler::zero,
1360 ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1361 }
1362 jcc(Assembler::equal, done);
1363
1364 Label try_revoke_bias;
1365 Label try_rebias;
1366
1367 // At this point we know that the header has the bias pattern and
1368 // that we are not the bias owner in the current epoch. We need to
1369 // figure out more details about the state of the header in order to
1370 // know what operations can be legally performed on the object's
1371 // header.
1372
1373 // If the low three bits in the xor result aren't clear, that means
1374 // the prototype header is no longer biased and we have to revoke
1375 // the bias on this object.
1376 testptr(header_reg, markWord::biased_lock_mask_in_place);
1377 jcc(Assembler::notZero, try_revoke_bias);
1378
1379 // Biasing is still enabled for this data type. See whether the
1380 // epoch of the current bias is still valid, meaning that the epoch
1381 // bits of the mark word are equal to the epoch bits of the
1382 // prototype header. (Note that the prototype header's epoch bits
1383 // only change at a safepoint.) If not, attempt to rebias the object
1384 // toward the current thread. Note that we must be absolutely sure
1385 // that the current epoch is invalid in order to do this because
1386 // otherwise the manipulations it performs on the mark word are
1387 // illegal.
1388 testptr(header_reg, markWord::epoch_mask_in_place);
1389 jccb(Assembler::notZero, try_rebias);
1390
1391 // The epoch of the current bias is still valid but we know nothing
1392 // about the owner; it might be set or it might be clear. Try to
1393 // acquire the bias of the object using an atomic operation. If this
1394 // fails we will go in to the runtime to revoke the object's bias.
1395 // Note that we first construct the presumed unbiased header so we
1396 // don't accidentally blow away another thread's valid bias.
1397 NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1398 andptr(swap_reg,
1399 markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
1400 #ifdef _LP64
1401 movptr(tmp_reg, swap_reg);
1402 orptr(tmp_reg, r15_thread);
1403 #else
1404 get_thread(tmp_reg);
1405 orptr(tmp_reg, swap_reg);
1406 #endif
1407 lock();
1408 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1409 // If the biasing toward our thread failed, this means that
1410 // another thread succeeded in biasing it toward itself and we
1411 // need to revoke that bias. The revocation will occur in the
1412 // interpreter runtime in the slow case.
1413 if (counters != NULL) {
1414 cond_inc32(Assembler::zero,
1415 ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1416 }
1417 if (slow_case != NULL) {
1418 jcc(Assembler::notZero, *slow_case);
1419 }
1420 jmp(done);
1421
1422 bind(try_rebias);
1423 // At this point we know the epoch has expired, meaning that the
1424 // current "bias owner", if any, is actually invalid. Under these
1425 // circumstances _only_, we are allowed to use the current header's
1426 // value as the comparison value when doing the cas to acquire the
1427 // bias in the current epoch. In other words, we allow transfer of
1428 // the bias from one thread to another directly in this situation.
1429 //
1430 // FIXME: due to a lack of registers we currently blow away the age
1431 // bits in this situation. Should attempt to preserve them.
1432 load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
1433 #ifdef _LP64
1434 orptr(tmp_reg, r15_thread);
1435 #else
1436 get_thread(swap_reg);
1437 orptr(tmp_reg, swap_reg);
1438 movptr(swap_reg, saved_mark_addr);
1439 #endif
1440 lock();
1441 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1442 // If the biasing toward our thread failed, then another thread
1443 // succeeded in biasing it toward itself and we need to revoke that
1444 // bias. The revocation will occur in the runtime in the slow case.
1445 if (counters != NULL) {
1446 cond_inc32(Assembler::zero,
1447 ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1448 }
1449 if (slow_case != NULL) {
1450 jcc(Assembler::notZero, *slow_case);
1451 }
1452 jmp(done);
1453
1454 bind(try_revoke_bias);
1455 // The prototype mark in the klass doesn't have the bias bit set any
1456 // more, indicating that objects of this data type are not supposed
1457 // to be biased any more. We are going to try to reset the mark of
1458 // this object to the prototype value and fall through to the
1459 // CAS-based locking scheme. Note that if our CAS fails, it means
1460 // that another thread raced us for the privilege of revoking the
1461 // bias of this particular object, so it's okay to continue in the
1462 // normal locking code.
1463 //
1464 // FIXME: due to a lack of registers we currently blow away the age
1465 // bits in this situation. Should attempt to preserve them.
1466 NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1467 load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
1468 lock();
1469 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1470 // Fall through to the normal CAS-based lock, because no matter what
1471 // the result of the above CAS, some thread must have succeeded in
1472 // removing the bias bit from the object's header.
1473 if (counters != NULL) {
1474 cond_inc32(Assembler::zero,
1475 ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1476 }
1477
1478 bind(cas_label);
1479 }
1480
1481 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1482 assert(UseBiasedLocking, "why call this otherwise?");
1483
1484 // Check for biased locking unlock case, which is a no-op
1485 // Note: we do not have to check the thread ID for two reasons.
1486 // First, the interpreter checks for IllegalMonitorStateException at
1487 // a higher level. Second, if the bias was revoked while we held the
1488 // lock, the object could not be rebiased toward another thread, so
1489 // the bias bit would be clear.
1490 movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1491 andptr(temp_reg, markWord::biased_lock_mask_in_place);
1492 cmpptr(temp_reg, markWord::biased_lock_pattern);
1493 jcc(Assembler::equal, done);
1494 }
1495
1496 void MacroAssembler::c2bool(Register x) {
1497 // implements x == 0 ? 0 : 1
1498 // note: must only look at least-significant byte of x
1499 // since C-style booleans are stored in one byte
1500 // only! (was bug)
1501 andl(x, 0xFF);
1502 setb(Assembler::notZero, x);
1503 }
1504
1505 // Wouldn't need if AddressLiteral version had new name
1506 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1507 Assembler::call(L, rtype);
1508 }
1509
1510 void MacroAssembler::call(Register entry) {
1511 Assembler::call(entry);
1512 }
1513
1514 void MacroAssembler::call(AddressLiteral entry) {
1515 if (reachable(entry)) {
1516 Assembler::call_literal(entry.target(), entry.rspec());
1517 } else {
1518 lea(rscratch1, entry);
1519 Assembler::call(rscratch1);
1520 }
1521 }
1522
1523 void MacroAssembler::ic_call(address entry, jint method_index) {
1524 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
1525 movptr(rax, (intptr_t)Universe::non_oop_word());
1526 call(AddressLiteral(entry, rh));
1527 }
1528
1529 // Implementation of call_VM versions
1530
1531 void MacroAssembler::call_VM(Register oop_result,
1532 address entry_point,
1533 bool check_exceptions) {
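  // The call(C)/ret(0) pair gives the out-of-line helper sequence a real return
  // address: call(C) pushes the address of the following jmp(E) and branches to
  // C; after call_VM_helper returns, ret(0) resumes at that jmp, which skips
  // over the sequence to E.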
1534 Label C, E;
1535 call(C, relocInfo::none);
1536 jmp(E);
1537
1538 bind(C);
1539 call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1540 ret(0);
1541
1542 bind(E);
1543 }
1544
1545 void MacroAssembler::call_VM(Register oop_result,
1546 address entry_point,
1547 Register arg_1,
1548 bool check_exceptions) {
1549 Label C, E;
1550 call(C, relocInfo::none);
1551 jmp(E);
1552
1553 bind(C);
1554 pass_arg1(this, arg_1);
1555 call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1556 ret(0);
1557
1558 bind(E);
1559 }
1560
1561 void MacroAssembler::call_VM(Register oop_result,
1562 address entry_point,
1563 Register arg_1,
1564 Register arg_2,
1565 bool check_exceptions) {
1566 Label C, E;
1567 call(C, relocInfo::none);
1568 jmp(E);
1569
1570 bind(C);
1571
1572 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1573
1574 pass_arg2(this, arg_2);
1575 pass_arg1(this, arg_1);
1576 call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1577 ret(0);
1578
1579 bind(E);
1580 }
1581
1582 void MacroAssembler::call_VM(Register oop_result,
1583 address entry_point,
1584 Register arg_1,
1585 Register arg_2,
1586 Register arg_3,
1587 bool check_exceptions) {
1588 Label C, E;
1589 call(C, relocInfo::none);
1590 jmp(E);
1591
1592 bind(C);
1593
1594 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1595 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1596 pass_arg3(this, arg_3);
1597
1598 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1599 pass_arg2(this, arg_2);
1600
1601 pass_arg1(this, arg_1);
1602 call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1603 ret(0);
1604
1605 bind(E);
1606 }
1607
1608 void MacroAssembler::call_VM(Register oop_result,
1609 Register last_java_sp,
1610 address entry_point,
1611 int number_of_arguments,
1612 bool check_exceptions) {
1613 Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1614 call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1615 }
1616
1617 void MacroAssembler::call_VM(Register oop_result,
1618 Register last_java_sp,
1619 address entry_point,
1620 Register arg_1,
1621 bool check_exceptions) {
1622 pass_arg1(this, arg_1);
1623 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1624 }
1625
1626 void MacroAssembler::call_VM(Register oop_result,
1627 Register last_java_sp,
1628 address entry_point,
1629 Register arg_1,
1630 Register arg_2,
1631 bool check_exceptions) {
1632
1633 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1634 pass_arg2(this, arg_2);
1635 pass_arg1(this, arg_1);
1636 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1637 }
1638
1639 void MacroAssembler::call_VM(Register oop_result,
1640 Register last_java_sp,
1641 address entry_point,
1642 Register arg_1,
1643 Register arg_2,
1644 Register arg_3,
1645 bool check_exceptions) {
1646 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1647 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1648 pass_arg3(this, arg_3);
1649 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1650 pass_arg2(this, arg_2);
1651 pass_arg1(this, arg_1);
1652 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1653 }
1654
1655 void MacroAssembler::super_call_VM(Register oop_result,
1656 Register last_java_sp,
1657 address entry_point,
1658 int number_of_arguments,
1659 bool check_exceptions) {
1660 Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1661 MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1662 }
1663
1664 void MacroAssembler::super_call_VM(Register oop_result,
1665 Register last_java_sp,
1666 address entry_point,
1667 Register arg_1,
1668 bool check_exceptions) {
1669 pass_arg1(this, arg_1);
1670 super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1671 }
1672
1673 void MacroAssembler::super_call_VM(Register oop_result,
1674 Register last_java_sp,
1675 address entry_point,
1676 Register arg_1,
1677 Register arg_2,
1678 bool check_exceptions) {
1679
1680 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1681 pass_arg2(this, arg_2);
1682 pass_arg1(this, arg_1);
1683 super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1684 }
1685
1686 void MacroAssembler::super_call_VM(Register oop_result,
1687 Register last_java_sp,
1688 address entry_point,
1689 Register arg_1,
1690 Register arg_2,
1691 Register arg_3,
1692 bool check_exceptions) {
1693 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1694 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1695 pass_arg3(this, arg_3);
1696 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1697 pass_arg2(this, arg_2);
1698 pass_arg1(this, arg_1);
1699 super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1700 }
1701
1702 void MacroAssembler::call_VM_base(Register oop_result,
1703 Register java_thread,
1704 Register last_java_sp,
1705 address entry_point,
1706 int number_of_arguments,
1707 bool check_exceptions) {
1708 // determine java_thread register
1709 if (!java_thread->is_valid()) {
1710 #ifdef _LP64
1711 java_thread = r15_thread;
1712 #else
1713 java_thread = rdi;
1714 get_thread(java_thread);
1715 #endif // LP64
1716 }
1717 // determine last_java_sp register
1718 if (!last_java_sp->is_valid()) {
1719 last_java_sp = rsp;
1720 }
1721 // debugging support
1722 assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
1723 LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
1724 #ifdef ASSERT
1725 // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1726 // r12 is the heapbase.
1727 LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
1728 #endif // ASSERT
1729
1730 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
1731 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1732
1733 // push java thread (becomes first argument of C function)
1734
1735 NOT_LP64(push(java_thread); number_of_arguments++);
1736 LP64_ONLY(mov(c_rarg0, r15_thread));
1737
1738 // set last Java frame before call
1739 assert(last_java_sp != rbp, "can't use ebp/rbp");
1740
1741 // Only interpreter should have to set fp
1742 set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
1743
1744 // do the call, remove parameters
1745 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1746
1747 // restore the thread (cannot use the pushed argument since arguments
1748 // may be overwritten by C code generated by an optimizing compiler);
  // however, we can use the register value directly if it is callee saved.
1750 if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
1751 // rdi & rsi (also r15) are callee saved -> nothing to do
1752 #ifdef ASSERT
1753 guarantee(java_thread != rax, "change this code");
1754 push(rax);
1755 { Label L;
1756 get_thread(rax);
1757 cmpptr(java_thread, rax);
1758 jcc(Assembler::equal, L);
1759 STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
1760 bind(L);
1761 }
1762 pop(rax);
1763 #endif
1764 } else {
1765 get_thread(java_thread);
1766 }
1767 // reset last Java frame
1768 // Only interpreter should have to clear fp
1769 reset_last_Java_frame(java_thread, true);
1770
1771 // C++ interp handles this in the interpreter
1772 check_and_handle_popframe(java_thread);
1773 check_and_handle_earlyret(java_thread);
1774
1775 if (check_exceptions) {
1776 // check for pending exceptions (java_thread is set upon return)
1777 cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
1778 #ifndef _LP64
1779 jump_cc(Assembler::notEqual,
1780 RuntimeAddress(StubRoutines::forward_exception_entry()));
1781 #else
    // This used to be a conditional jump to forward_exception, but after
    // relocation the branch might no longer reach its target. So we branch
    // around an unconditional far jump, which can always reach it.
1785
1786 Label ok;
1787 jcc(Assembler::equal, ok);
1788 jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1789 bind(ok);
1790 #endif // LP64
1791 }
1792
1793 // get oop result if there is one and reset the value in the thread
1794 if (oop_result->is_valid()) {
1795 get_vm_result(oop_result, java_thread);
1796 }
1797 }
1798
1799 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1800
  // Calculate the value for last_Java_sp. This is somewhat subtle:
  // call_VM does an intermediate call which places a return address on
  // the stack just under the stack pointer as the caller left it. This
  // allows us to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32-bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64-bit call_VM can only use register args,
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.
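  //
  // A rough sketch of the 64-bit stack at this point (a sketch, not generated code):
  //
  //   rsp      -> [ return address pushed by the intermediate call ]
  //   rsp + 8  -> caller's frame
  //
  // so last_Java_sp = rsp + wordSize and last_Java_pc = last_Java_sp[-1].
  // On 32-bit the argument slots pushed on top of the return address are
  // accounted for by the (1 + number_of_arguments) term below.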
1810
1811 #ifdef _LP64
1812 // We've pushed one address, correct last_Java_sp
1813 lea(rax, Address(rsp, wordSize));
1814 #else
1815 lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
1816 #endif // LP64
1817
1818 call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
1819
1820 }
1821
1822 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1823 void MacroAssembler::call_VM_leaf0(address entry_point) {
1824 MacroAssembler::call_VM_leaf_base(entry_point, 0);
1825 }
1826
1827 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1828 call_VM_leaf_base(entry_point, number_of_arguments);
1829 }
1830
1831 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1832 pass_arg0(this, arg_0);
1833 call_VM_leaf(entry_point, 1);
1834 }
1835
1836 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1837
1838 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1839 pass_arg1(this, arg_1);
1840 pass_arg0(this, arg_0);
1841 call_VM_leaf(entry_point, 2);
1842 }
1843
1844 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1845 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1846 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1847 pass_arg2(this, arg_2);
1848 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1849 pass_arg1(this, arg_1);
1850 pass_arg0(this, arg_0);
1851 call_VM_leaf(entry_point, 3);
1852 }
1853
1854 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1855 pass_arg0(this, arg_0);
1856 MacroAssembler::call_VM_leaf_base(entry_point, 1);
1857 }
1858
1859 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1860
1861 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1862 pass_arg1(this, arg_1);
1863 pass_arg0(this, arg_0);
1864 MacroAssembler::call_VM_leaf_base(entry_point, 2);
1865 }
1866
1867 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1868 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1869 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1870 pass_arg2(this, arg_2);
1871 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1872 pass_arg1(this, arg_1);
1873 pass_arg0(this, arg_0);
1874 MacroAssembler::call_VM_leaf_base(entry_point, 3);
1875 }
1876
1877 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1878 LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1879 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1880 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1881 pass_arg3(this, arg_3);
1882 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1883 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1884 pass_arg2(this, arg_2);
1885 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1886 pass_arg1(this, arg_1);
1887 pass_arg0(this, arg_0);
1888 MacroAssembler::call_VM_leaf_base(entry_point, 4);
1889 }
1890
1891 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1892 movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1893 movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
1894 verify_oop_msg(oop_result, "broken oop in call_VM_base");
1895 }
1896
1897 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1898 movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1899 movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
1900 }
1901
1902 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
1903 }
1904
1905 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
1906 }
1907
1908 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
1909 if (reachable(src1)) {
1910 cmpl(as_Address(src1), imm);
1911 } else {
1912 lea(rscratch1, src1);
1913 cmpl(Address(rscratch1, 0), imm);
1914 }
1915 }
1916
1917 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
1918 assert(!src2.is_lval(), "use cmpptr");
1919 if (reachable(src2)) {
1920 cmpl(src1, as_Address(src2));
1921 } else {
1922 lea(rscratch1, src2);
1923 cmpl(src1, Address(rscratch1, 0));
1924 }
1925 }
1926
1927 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1928 Assembler::cmpl(src1, imm);
1929 }
1930
1931 void MacroAssembler::cmp32(Register src1, Address src2) {
1932 Assembler::cmpl(src1, src2);
1933 }
1934
1935 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
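  // Materializes the result of a double compare as an int in {-1, 0, +1},
  // matching the Java dcmpl/dcmpg bytecodes: an unordered (NaN) operand yields
  // -1 when unordered_is_less and +1 otherwise. cmpss2int below is the
  // single-precision analogue.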
1936 ucomisd(opr1, opr2);
1937
1938 Label L;
1939 if (unordered_is_less) {
1940 movl(dst, -1);
1941 jcc(Assembler::parity, L);
1942 jcc(Assembler::below , L);
1943 movl(dst, 0);
1944 jcc(Assembler::equal , L);
1945 increment(dst);
1946 } else { // unordered is greater
1947 movl(dst, 1);
1948 jcc(Assembler::parity, L);
1949 jcc(Assembler::above , L);
1950 movl(dst, 0);
1951 jcc(Assembler::equal , L);
1952 decrementl(dst);
1953 }
1954 bind(L);
1955 }
1956
1957 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1958 ucomiss(opr1, opr2);
1959
1960 Label L;
1961 if (unordered_is_less) {
1962 movl(dst, -1);
1963 jcc(Assembler::parity, L);
1964 jcc(Assembler::below , L);
1965 movl(dst, 0);
1966 jcc(Assembler::equal , L);
1967 increment(dst);
1968 } else { // unordered is greater
1969 movl(dst, 1);
1970 jcc(Assembler::parity, L);
1971 jcc(Assembler::above , L);
1972 movl(dst, 0);
1973 jcc(Assembler::equal , L);
1974 decrementl(dst);
1975 }
1976 bind(L);
1977 }
1978
1979
1980 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
1981 if (reachable(src1)) {
1982 cmpb(as_Address(src1), imm);
1983 } else {
1984 lea(rscratch1, src1);
1985 cmpb(Address(rscratch1, 0), imm);
1986 }
1987 }
1988
1989 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
1990 #ifdef _LP64
1991 if (src2.is_lval()) {
1992 movptr(rscratch1, src2);
1993 Assembler::cmpq(src1, rscratch1);
1994 } else if (reachable(src2)) {
1995 cmpq(src1, as_Address(src2));
1996 } else {
1997 lea(rscratch1, src2);
1998 Assembler::cmpq(src1, Address(rscratch1, 0));
1999 }
2000 #else
2001 if (src2.is_lval()) {
2002 cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2003 } else {
2004 cmpl(src1, as_Address(src2));
2005 }
2006 #endif // _LP64
2007 }
2008
2009 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2010 assert(src2.is_lval(), "not a mem-mem compare");
2011 #ifdef _LP64
2012 // moves src2's literal address
2013 movptr(rscratch1, src2);
2014 Assembler::cmpq(src1, rscratch1);
2015 #else
2016 cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2017 #endif // _LP64
2018 }
2019
2020 void MacroAssembler::cmpoop(Register src1, Register src2) {
2021 cmpptr(src1, src2);
2022 }
2023
2024 void MacroAssembler::cmpoop(Register src1, Address src2) {
2025 cmpptr(src1, src2);
2026 }
2027
2028 #ifdef _LP64
2029 void MacroAssembler::cmpoop(Register src1, jobject src2) {
2030 movoop(rscratch1, src2);
2031 cmpptr(src1, rscratch1);
2032 }
2033 #endif
2034
2035 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2036 if (reachable(adr)) {
2037 lock();
2038 cmpxchgptr(reg, as_Address(adr));
2039 } else {
2040 lea(rscratch1, adr);
2041 lock();
2042 cmpxchgptr(reg, Address(rscratch1, 0));
2043 }
2044 }
2045
2046 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2047 LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2048 }
2049
2050 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2051 if (reachable(src)) {
2052 Assembler::comisd(dst, as_Address(src));
2053 } else {
2054 lea(rscratch1, src);
2055 Assembler::comisd(dst, Address(rscratch1, 0));
2056 }
2057 }
2058
2059 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2060 if (reachable(src)) {
2061 Assembler::comiss(dst, as_Address(src));
2062 } else {
2063 lea(rscratch1, src);
2064 Assembler::comiss(dst, Address(rscratch1, 0));
2065 }
2066 }
2067
2068
2069 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
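  // Bumps a counter only when 'cond' holds. Flags are saved and restored
  // around atomic_incl because the locked increment clobbers EFLAGS, which
  // the caller may still want to test afterwards.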
2070 Condition negated_cond = negate_condition(cond);
2071 Label L;
2072 jcc(negated_cond, L);
2073 pushf(); // Preserve flags
2074 atomic_incl(counter_addr);
2075 popf();
2076 bind(L);
2077 }
2078
2079 int MacroAssembler::corrected_idivl(Register reg) {
2080 // Full implementation of Java idiv and irem; checks for
2081 // special case as described in JVM spec., p.243 & p.271.
2082 // The function returns the (pc) offset of the idivl
2083 // instruction - may be needed for implicit exceptions.
2084 //
2085 // normal case special case
2086 //
  // input : rax: dividend                          min_int
  //         reg: divisor   (may not be rax/rdx)    -1
  //
  // output: rax: quotient  (= rax idiv reg)        min_int
  //         rdx: remainder (= rax irem reg)        0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
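  // The special case exists because min_int / -1 overflows idivl (the exact
  // quotient 2^31 is not representable) and would raise #DE; the JVM spec
  // instead requires quotient == min_int and remainder == 0, which is what
  // the special-case path below produces.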
2093 const int min_int = 0x80000000;
2094 Label normal_case, special_case;
2095
2096 // check for special case
2097 cmpl(rax, min_int);
2098 jcc(Assembler::notEqual, normal_case);
2099 xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2100 cmpl(reg, -1);
2101 jcc(Assembler::equal, special_case);
2102
2103 // handle normal case
2104 bind(normal_case);
2105 cdql();
2106 int idivl_offset = offset();
2107 idivl(reg);
2108
2109 // normal and special case exit
2110 bind(special_case);
2111
2112 return idivl_offset;
2113 }
2114
2115
2116
2117 void MacroAssembler::decrementl(Register reg, int value) {
2118 if (value == min_jint) {subl(reg, value) ; return; }
2119 if (value < 0) { incrementl(reg, -value); return; }
2120 if (value == 0) { ; return; }
2121 if (value == 1 && UseIncDec) { decl(reg) ; return; }
2122 /* else */ { subl(reg, value) ; return; }
2123 }
2124
2125 void MacroAssembler::decrementl(Address dst, int value) {
2126 if (value == min_jint) {subl(dst, value) ; return; }
2127 if (value < 0) { incrementl(dst, -value); return; }
2128 if (value == 0) { ; return; }
2129 if (value == 1 && UseIncDec) { decl(dst) ; return; }
2130 /* else */ { subl(dst, value) ; return; }
2131 }
2132
2133 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
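  // Signed division of reg by 2^shift_value via an arithmetic shift. A plain
  // sar rounds toward negative infinity, so negative values are first biased
  // by (2^shift - 1) to get truncation toward zero (Java division semantics).
  // For example: -7 / 4 -> (-7 + 3) >> 2 = -1, whereas sar alone gives -2.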
2134 assert (shift_value > 0, "illegal shift value");
2135 Label _is_positive;
2136 testl (reg, reg);
2137 jcc (Assembler::positive, _is_positive);
2138 int offset = (1 << shift_value) - 1 ;
2139
2140 if (offset == 1) {
2141 incrementl(reg);
2142 } else {
2143 addl(reg, offset);
2144 }
2145
2146 bind (_is_positive);
2147 sarl(reg, shift_value);
2148 }
2149
2150 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2151 if (reachable(src)) {
2152 Assembler::divsd(dst, as_Address(src));
2153 } else {
2154 lea(rscratch1, src);
2155 Assembler::divsd(dst, Address(rscratch1, 0));
2156 }
2157 }
2158
2159 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2160 if (reachable(src)) {
2161 Assembler::divss(dst, as_Address(src));
2162 } else {
2163 lea(rscratch1, src);
2164 Assembler::divss(dst, Address(rscratch1, 0));
2165 }
2166 }
2167
2168 void MacroAssembler::enter() {
2169 push(rbp);
2170 mov(rbp, rsp);
2171 }
2172
2173 // A 5 byte nop that is safe for patching (see patch_verified_entry)
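// The fallback below emits four segment-override prefixes followed by a
// one-byte nop; they decode as a single 5-byte instruction, so the sequence
// can later be overwritten atomically (e.g. with a 5-byte jump).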
2174 void MacroAssembler::fat_nop() {
2175 if (UseAddressNop) {
2176 addr_nop_5();
2177 } else {
2178 emit_int8(0x26); // es:
2179 emit_int8(0x2e); // cs:
2180 emit_int8(0x64); // fs:
2181 emit_int8(0x65); // gs:
2182 emit_int8((unsigned char)0x90);
2183 }
2184 }
2185
2186 #ifndef _LP64
2187 void MacroAssembler::fcmp(Register tmp) {
2188 fcmp(tmp, 1, true, true);
2189 }
2190
2191 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2192 assert(!pop_right || pop_left, "usage error");
2193 if (VM_Version::supports_cmov()) {
2194 assert(tmp == noreg, "unneeded temp");
2195 if (pop_left) {
2196 fucomip(index);
2197 } else {
2198 fucomi(index);
2199 }
2200 if (pop_right) {
2201 fpop();
2202 }
2203 } else {
2204 assert(tmp != noreg, "need temp");
2205 if (pop_left) {
2206 if (pop_right) {
2207 fcompp();
2208 } else {
2209 fcomp(index);
2210 }
2211 } else {
2212 fcom(index);
2213 }
    // convert FPU condition into eflags condition via rax
2215 save_rax(tmp);
2216 fwait(); fnstsw_ax();
2217 sahf();
2218 restore_rax(tmp);
2219 }
2220 // condition codes set as follows:
2221 //
2222 // CF (corresponds to C0) if x < y
2223 // PF (corresponds to C2) if unordered
2224 // ZF (corresponds to C3) if x = y
2225 }
2226
2227 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2228 fcmp2int(dst, unordered_is_less, 1, true, true);
2229 }
2230
2231 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2232 fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2233 Label L;
2234 if (unordered_is_less) {
2235 movl(dst, -1);
2236 jcc(Assembler::parity, L);
2237 jcc(Assembler::below , L);
2238 movl(dst, 0);
2239 jcc(Assembler::equal , L);
2240 increment(dst);
2241 } else { // unordered is greater
2242 movl(dst, 1);
2243 jcc(Assembler::parity, L);
2244 jcc(Assembler::above , L);
2245 movl(dst, 0);
2246 jcc(Assembler::equal , L);
2247 decrementl(dst);
2248 }
2249 bind(L);
2250 }
2251
2252 void MacroAssembler::fld_d(AddressLiteral src) {
2253 fld_d(as_Address(src));
2254 }
2255
2256 void MacroAssembler::fld_s(AddressLiteral src) {
2257 fld_s(as_Address(src));
2258 }
2259
2260 void MacroAssembler::fldcw(AddressLiteral src) {
2261 Assembler::fldcw(as_Address(src));
2262 }
2263
2264 void MacroAssembler::fpop() {
2265 ffree();
2266 fincstp();
2267 }
2268
2269 void MacroAssembler::fremr(Register tmp) {
2270 save_rax(tmp);
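  // fprem computes only a partial remainder and sets the x87 C2 flag while
  // more reduction remains; fnstsw/sahf maps C2 onto the parity flag, so the
  // loop repeats until the remainder is complete.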
2271 { Label L;
2272 bind(L);
2273 fprem();
2274 fwait(); fnstsw_ax();
2275 sahf();
2276 jcc(Assembler::parity, L);
2277 }
2278 restore_rax(tmp);
2279 // Result is in ST0.
2280 // Note: fxch & fpop to get rid of ST1
2281 // (otherwise FPU stack could overflow eventually)
2282 fxch(1);
2283 fpop();
2284 }
2285
2286 void MacroAssembler::empty_FPU_stack() {
2287 if (VM_Version::supports_mmx()) {
2288 emms();
2289 } else {
2290 for (int i = 8; i-- > 0; ) ffree(i);
2291 }
2292 }
2293 #endif // !LP64
2294
2295 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2296 if (reachable(src)) {
2297 Assembler::mulpd(dst, as_Address(src));
2298 } else {
2299 lea(rscratch1, src);
2300 Assembler::mulpd(dst, Address(rscratch1, 0));
2301 }
2302 }
2303
2304 void MacroAssembler::load_float(Address src) {
2305 #ifdef _LP64
2306 movflt(xmm0, src);
2307 #else
2308 if (UseSSE >= 1) {
2309 movflt(xmm0, src);
2310 } else {
2311 fld_s(src);
2312 }
2313 #endif // LP64
2314 }
2315
2316 void MacroAssembler::store_float(Address dst) {
2317 #ifdef _LP64
2318 movflt(dst, xmm0);
2319 #else
2320 if (UseSSE >= 1) {
2321 movflt(dst, xmm0);
2322 } else {
2323 fstp_s(dst);
2324 }
2325 #endif // LP64
2326 }
2327
2328 void MacroAssembler::load_double(Address src) {
2329 #ifdef _LP64
2330 movdbl(xmm0, src);
2331 #else
2332 if (UseSSE >= 2) {
2333 movdbl(xmm0, src);
2334 } else {
2335 fld_d(src);
2336 }
2337 #endif // LP64
2338 }
2339
2340 void MacroAssembler::store_double(Address dst) {
2341 #ifdef _LP64
2342 movdbl(dst, xmm0);
2343 #else
2344 if (UseSSE >= 2) {
2345 movdbl(dst, xmm0);
2346 } else {
2347 fstp_d(dst);
2348 }
2349 #endif // LP64
2350 }
2351
2352 // dst = c = a * b + c
2353 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2354 Assembler::vfmadd231sd(c, a, b);
2355 if (dst != c) {
2356 movdbl(dst, c);
2357 }
2358 }
2359
2360 // dst = c = a * b + c
2361 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2362 Assembler::vfmadd231ss(c, a, b);
2363 if (dst != c) {
2364 movflt(dst, c);
2365 }
2366 }
2367
2368 // dst = c = a * b + c
2369 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2370 Assembler::vfmadd231pd(c, a, b, vector_len);
2371 if (dst != c) {
2372 vmovdqu(dst, c);
2373 }
2374 }
2375
2376 // dst = c = a * b + c
2377 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2378 Assembler::vfmadd231ps(c, a, b, vector_len);
2379 if (dst != c) {
2380 vmovdqu(dst, c);
2381 }
2382 }
2383
2384 // dst = c = a * b + c
2385 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2386 Assembler::vfmadd231pd(c, a, b, vector_len);
2387 if (dst != c) {
2388 vmovdqu(dst, c);
2389 }
2390 }
2391
2392 // dst = c = a * b + c
2393 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2394 Assembler::vfmadd231ps(c, a, b, vector_len);
2395 if (dst != c) {
2396 vmovdqu(dst, c);
2397 }
2398 }
2399
2400 void MacroAssembler::incrementl(AddressLiteral dst) {
2401 if (reachable(dst)) {
2402 incrementl(as_Address(dst));
2403 } else {
2404 lea(rscratch1, dst);
2405 incrementl(Address(rscratch1, 0));
2406 }
2407 }
2408
2409 void MacroAssembler::incrementl(ArrayAddress dst) {
2410 incrementl(as_Address(dst));
2411 }
2412
2413 void MacroAssembler::incrementl(Register reg, int value) {
2414 if (value == min_jint) {addl(reg, value) ; return; }
2415 if (value < 0) { decrementl(reg, -value); return; }
2416 if (value == 0) { ; return; }
2417 if (value == 1 && UseIncDec) { incl(reg) ; return; }
2418 /* else */ { addl(reg, value) ; return; }
2419 }
2420
2421 void MacroAssembler::incrementl(Address dst, int value) {
2422 if (value == min_jint) {addl(dst, value) ; return; }
2423 if (value < 0) { decrementl(dst, -value); return; }
2424 if (value == 0) { ; return; }
2425 if (value == 1 && UseIncDec) { incl(dst) ; return; }
2426 /* else */ { addl(dst, value) ; return; }
2427 }
2428
2429 void MacroAssembler::jump(AddressLiteral dst) {
2430 if (reachable(dst)) {
2431 jmp_literal(dst.target(), dst.rspec());
2432 } else {
2433 lea(rscratch1, dst);
2434 jmp(rscratch1);
2435 }
2436 }
2437
2438 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
2439 if (reachable(dst)) {
2440 InstructionMark im(this);
2441 relocate(dst.reloc());
2442 const int short_size = 2;
2443 const int long_size = 6;
2444 int offs = (intptr_t)dst.target() - ((intptr_t)pc());
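    // Jcc displacements are relative to the end of the branch instruction,
    // hence the short_size/long_size corrections applied to offs below.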
2445 if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
2446 // 0111 tttn #8-bit disp
2447 emit_int8(0x70 | cc);
2448 emit_int8((offs - short_size) & 0xFF);
2449 } else {
2450 // 0000 1111 1000 tttn #32-bit disp
2451 emit_int8(0x0F);
2452 emit_int8((unsigned char)(0x80 | cc));
2453 emit_int32(offs - long_size);
2454 }
2455 } else {
2456 #ifdef ASSERT
2457 warning("reversing conditional branch");
2458 #endif /* ASSERT */
2459 Label skip;
2460 jccb(reverse[cc], skip);
2461 lea(rscratch1, dst);
2462 Assembler::jmp(rscratch1);
2463 bind(skip);
2464 }
2465 }
2466
2467 void MacroAssembler::fld_x(AddressLiteral src) {
2468 Assembler::fld_x(as_Address(src));
2469 }
2470
2471 void MacroAssembler::ldmxcsr(AddressLiteral src) {
2472 if (reachable(src)) {
2473 Assembler::ldmxcsr(as_Address(src));
2474 } else {
2475 lea(rscratch1, src);
2476 Assembler::ldmxcsr(Address(rscratch1, 0));
2477 }
2478 }
2479
2480 int MacroAssembler::load_signed_byte(Register dst, Address src) {
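  // On 64-bit the movsx form below is always taken (note the LP64_ONLY(true ||)
  // short-circuit); the shift-pair fallback only matters for 32-bit VMs on
  // pre-P6 hardware, presumably where movsx/movzx and partial-register writes
  // were costly (see the Intel note cited at load_unsigned_byte).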
2481 int off;
2482 if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2483 off = offset();
2484 movsbl(dst, src); // movsxb
2485 } else {
2486 off = load_unsigned_byte(dst, src);
2487 shll(dst, 24);
2488 sarl(dst, 24);
2489 }
2490 return off;
2491 }
2492
2493 // Note: load_signed_short used to be called load_signed_word.
2494 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
2495 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
2496 // The term "word" in HotSpot means a 32- or 64-bit machine word.
2497 int MacroAssembler::load_signed_short(Register dst, Address src) {
2498 int off;
2499 if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // It would seem just as safe to do a signed 16 => 64 bit extension here,
    // but the 64-bit port has always used the 32-bit form, which implies that
    // callers rely only on the low 32 bits of the result.
2503 off = offset();
2504 movswl(dst, src); // movsxw
2505 } else {
2506 off = load_unsigned_short(dst, src);
2507 shll(dst, 16);
2508 sarl(dst, 16);
2509 }
2510 return off;
2511 }
2512
2513 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22.
2516 int off;
2517 if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
2518 off = offset();
2519 movzbl(dst, src); // movzxb
2520 } else {
2521 xorl(dst, dst);
2522 off = offset();
2523 movb(dst, src);
2524 }
2525 return off;
2526 }
2527
2528 // Note: load_unsigned_short used to be called load_unsigned_word.
2529 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22.
2532 int off;
2533 if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
2534 off = offset();
2535 movzwl(dst, src); // movzxw
2536 } else {
2537 xorl(dst, dst);
2538 off = offset();
2539 movw(dst, src);
2540 }
2541 return off;
2542 }
2543
2544 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
2545 switch (size_in_bytes) {
2546 #ifndef _LP64
2547 case 8:
2548 assert(dst2 != noreg, "second dest register required");
2549 movl(dst, src);
2550 movl(dst2, src.plus_disp(BytesPerInt));
2551 break;
2552 #else
2553 case 8: movq(dst, src); break;
2554 #endif
2555 case 4: movl(dst, src); break;
2556 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2557 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2558 default: ShouldNotReachHere();
2559 }
2560 }
2561
2562 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2563 switch (size_in_bytes) {
2564 #ifndef _LP64
2565 case 8:
2566 assert(src2 != noreg, "second source register required");
2567 movl(dst, src);
2568 movl(dst.plus_disp(BytesPerInt), src2);
2569 break;
2570 #else
2571 case 8: movq(dst, src); break;
2572 #endif
2573 case 4: movl(dst, src); break;
2574 case 2: movw(dst, src); break;
2575 case 1: movb(dst, src); break;
2576 default: ShouldNotReachHere();
2577 }
2578 }
2579
2580 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
2581 if (reachable(dst)) {
2582 movl(as_Address(dst), src);
2583 } else {
2584 lea(rscratch1, dst);
2585 movl(Address(rscratch1, 0), src);
2586 }
2587 }
2588
2589 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2590 if (reachable(src)) {
2591 movl(dst, as_Address(src));
2592 } else {
2593 lea(rscratch1, src);
2594 movl(dst, Address(rscratch1, 0));
2595 }
2596 }
2597
2598 // C++ bool manipulation
2599
2600 void MacroAssembler::movbool(Register dst, Address src) {
2601 if(sizeof(bool) == 1)
2602 movb(dst, src);
2603 else if(sizeof(bool) == 2)
2604 movw(dst, src);
2605 else if(sizeof(bool) == 4)
2606 movl(dst, src);
2607 else
2608 // unsupported
2609 ShouldNotReachHere();
2610 }
2611
2612 void MacroAssembler::movbool(Address dst, bool boolconst) {
2613 if(sizeof(bool) == 1)
2614 movb(dst, (int) boolconst);
2615 else if(sizeof(bool) == 2)
2616 movw(dst, (int) boolconst);
2617 else if(sizeof(bool) == 4)
2618 movl(dst, (int) boolconst);
2619 else
2620 // unsupported
2621 ShouldNotReachHere();
2622 }
2623
2624 void MacroAssembler::movbool(Address dst, Register src) {
2625 if(sizeof(bool) == 1)
2626 movb(dst, src);
2627 else if(sizeof(bool) == 2)
2628 movw(dst, src);
2629 else if(sizeof(bool) == 4)
2630 movl(dst, src);
2631 else
2632 // unsupported
2633 ShouldNotReachHere();
2634 }
2635
2636 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
2637 movb(as_Address(dst), src);
2638 }
2639
2640 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
2641 if (reachable(src)) {
2642 movdl(dst, as_Address(src));
2643 } else {
2644 lea(rscratch1, src);
2645 movdl(dst, Address(rscratch1, 0));
2646 }
2647 }
2648
2649 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
2650 if (reachable(src)) {
2651 movq(dst, as_Address(src));
2652 } else {
2653 lea(rscratch1, src);
2654 movq(dst, Address(rscratch1, 0));
2655 }
2656 }
2657
2658 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
2659 if (reachable(src)) {
2660 if (UseXmmLoadAndClearUpper) {
2661 movsd (dst, as_Address(src));
2662 } else {
2663 movlpd(dst, as_Address(src));
2664 }
2665 } else {
2666 lea(rscratch1, src);
2667 if (UseXmmLoadAndClearUpper) {
2668 movsd (dst, Address(rscratch1, 0));
2669 } else {
2670 movlpd(dst, Address(rscratch1, 0));
2671 }
2672 }
2673 }
2674
2675 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
2676 if (reachable(src)) {
2677 movss(dst, as_Address(src));
2678 } else {
2679 lea(rscratch1, src);
2680 movss(dst, Address(rscratch1, 0));
2681 }
2682 }
2683
2684 void MacroAssembler::movptr(Register dst, Register src) {
2685 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2686 }
2687
2688 void MacroAssembler::movptr(Register dst, Address src) {
2689 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2690 }
2691
2692 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
2693 void MacroAssembler::movptr(Register dst, intptr_t src) {
2694 LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
2695 }
2696
2697 void MacroAssembler::movptr(Address dst, Register src) {
2698 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2699 }
2700
2701 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2702 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2703 Assembler::movdqu(dst, src);
2704 }
2705
2706 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2707 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2708 Assembler::movdqu(dst, src);
2709 }
2710
2711 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2712 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2713 Assembler::movdqu(dst, src);
2714 }
2715
2716 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
2717 if (reachable(src)) {
2718 movdqu(dst, as_Address(src));
2719 } else {
2720 lea(scratchReg, src);
2721 movdqu(dst, Address(scratchReg, 0));
2722 }
2723 }
2724
2725 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2726 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2727 Assembler::vmovdqu(dst, src);
2728 }
2729
2730 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2731 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2732 Assembler::vmovdqu(dst, src);
2733 }
2734
2735 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2736 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2737 Assembler::vmovdqu(dst, src);
2738 }
2739
2740 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
2741 if (reachable(src)) {
2742 vmovdqu(dst, as_Address(src));
2743 }
2744 else {
2745 lea(scratch_reg, src);
2746 vmovdqu(dst, Address(scratch_reg, 0));
2747 }
2748 }
2749
2750 void MacroAssembler::kmov(KRegister dst, Address src) {
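  // With AVX512BW the opmask registers are 64 bits wide and the kmovq form is
  // used; without it only the 16-bit kmovw form (available with base AVX512F)
  // is used. The same dispatch applies to the other kmov overloads below.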
2751 if (VM_Version::supports_avx512bw()) {
2752 kmovql(dst, src);
2753 } else {
2754 assert(VM_Version::supports_evex(), "");
2755 kmovwl(dst, src);
2756 }
2757 }
2758
2759 void MacroAssembler::kmov(Address dst, KRegister src) {
2760 if (VM_Version::supports_avx512bw()) {
2761 kmovql(dst, src);
2762 } else {
2763 assert(VM_Version::supports_evex(), "");
2764 kmovwl(dst, src);
2765 }
2766 }
2767
2768 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2769 if (VM_Version::supports_avx512bw()) {
2770 kmovql(dst, src);
2771 } else {
2772 assert(VM_Version::supports_evex(), "");
2773 kmovwl(dst, src);
2774 }
2775 }
2776
2777 void MacroAssembler::kmov(Register dst, KRegister src) {
2778 if (VM_Version::supports_avx512bw()) {
2779 kmovql(dst, src);
2780 } else {
2781 assert(VM_Version::supports_evex(), "");
2782 kmovwl(dst, src);
2783 }
2784 }
2785
2786 void MacroAssembler::kmov(KRegister dst, Register src) {
2787 if (VM_Version::supports_avx512bw()) {
2788 kmovql(dst, src);
2789 } else {
2790 assert(VM_Version::supports_evex(), "");
2791 kmovwl(dst, src);
2792 }
2793 }
2794
2795 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register scratch_reg) {
2796 if (reachable(src)) {
2797 kmovql(dst, as_Address(src));
2798 } else {
2799 lea(scratch_reg, src);
2800 kmovql(dst, Address(scratch_reg, 0));
2801 }
2802 }
2803
2804 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) {
2805 if (reachable(src)) {
2806 kmovwl(dst, as_Address(src));
2807 } else {
2808 lea(scratch_reg, src);
2809 kmovwl(dst, Address(scratch_reg, 0));
2810 }
2811 }
2812
2813 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2814 int vector_len, Register scratch_reg) {
2815 if (reachable(src)) {
2816 if (mask == k0) {
2817 Assembler::evmovdqub(dst, as_Address(src), merge, vector_len);
2818 } else {
2819 Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2820 }
2821 } else {
2822 lea(scratch_reg, src);
2823 if (mask == k0) {
2824 Assembler::evmovdqub(dst, Address(scratch_reg, 0), merge, vector_len);
2825 } else {
2826 Assembler::evmovdqub(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2827 }
2828 }
2829 }
2830
2831 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2832 int vector_len, Register scratch_reg) {
2833 if (reachable(src)) {
2834 Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2835 } else {
2836 lea(scratch_reg, src);
2837 Assembler::evmovdquw(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2838 }
2839 }
2840
2841 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2842 int vector_len, Register scratch_reg) {
2843 if (reachable(src)) {
2844 Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2845 } else {
2846 lea(scratch_reg, src);
2847 Assembler::evmovdqul(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2848 }
2849 }
2850
2851 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2852 int vector_len, Register scratch_reg) {
2853 if (reachable(src)) {
2854 Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2855 } else {
2856 lea(scratch_reg, src);
2857 Assembler::evmovdquq(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2858 }
2859 }
2860
2861 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2862 if (reachable(src)) {
2863 Assembler::evmovdquq(dst, as_Address(src), vector_len);
2864 } else {
2865 lea(rscratch, src);
2866 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2867 }
2868 }
2869
2870 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
2871 if (reachable(src)) {
2872 Assembler::movdqa(dst, as_Address(src));
2873 } else {
2874 lea(rscratch1, src);
2875 Assembler::movdqa(dst, Address(rscratch1, 0));
2876 }
2877 }
2878
2879 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
2880 if (reachable(src)) {
2881 Assembler::movsd(dst, as_Address(src));
2882 } else {
2883 lea(rscratch1, src);
2884 Assembler::movsd(dst, Address(rscratch1, 0));
2885 }
2886 }
2887
2888 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
2889 if (reachable(src)) {
2890 Assembler::movss(dst, as_Address(src));
2891 } else {
2892 lea(rscratch1, src);
2893 Assembler::movss(dst, Address(rscratch1, 0));
2894 }
2895 }
2896
2897 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
2898 if (reachable(src)) {
2899 Assembler::mulsd(dst, as_Address(src));
2900 } else {
2901 lea(rscratch1, src);
2902 Assembler::mulsd(dst, Address(rscratch1, 0));
2903 }
2904 }
2905
2906 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
2907 if (reachable(src)) {
2908 Assembler::mulss(dst, as_Address(src));
2909 } else {
2910 lea(rscratch1, src);
2911 Assembler::mulss(dst, Address(rscratch1, 0));
2912 }
2913 }
2914
2915 void MacroAssembler::null_check(Register reg, int offset) {
2916 if (needs_explicit_null_check(offset)) {
2917 // provoke OS NULL exception if reg = NULL by
2918 // accessing M[reg] w/o changing any (non-CC) registers
2919 // NOTE: cmpl is plenty here to provoke a segv
2920 cmpptr(rax, Address(reg, 0));
2921 // Note: should probably use testl(rax, Address(reg, 0));
2922 // may be shorter code (however, this version of
2923 // testl needs to be implemented first)
2924 } else {
2925 // nothing to do, (later) access of M[reg + offset]
2926 // will provoke OS NULL exception if reg = NULL
2927 }
2928 }
2929
2930 void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
2932 // (e.g., MSVC can't call ps() otherwise)
2933 call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2934 }
2935
2936 void MacroAssembler::unimplemented(const char* what) {
2937 const char* buf = NULL;
2938 {
2939 ResourceMark rm;
2940 stringStream ss;
2941 ss.print("unimplemented: %s", what);
2942 buf = code_string(ss.as_string());
2943 }
2944 stop(buf);
2945 }
2946
2947 #ifdef _LP64
2948 #define XSTATE_BV 0x200
2949 #endif
2950
2951 void MacroAssembler::pop_CPU_state() {
2952 pop_FPU_state();
2953 pop_IU_state();
2954 }
2955
2956 void MacroAssembler::pop_FPU_state() {
2957 #ifndef _LP64
2958 frstor(Address(rsp, 0));
2959 #else
2960 fxrstor(Address(rsp, 0));
2961 #endif
2962 addptr(rsp, FPUStateSizeInWords * wordSize);
2963 }
2964
2965 void MacroAssembler::pop_IU_state() {
2966 popa();
2967 LP64_ONLY(addq(rsp, 8));
2968 popf();
2969 }
2970
2971 // Save Integer and Float state
2972 // Warning: Stack must be 16 byte aligned (64bit)
2973 void MacroAssembler::push_CPU_state() {
2974 push_IU_state();
2975 push_FPU_state();
2976 }
2977
2978 void MacroAssembler::push_FPU_state() {
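  // 32-bit uses fnsave/frstor (legacy x87 state only); 64-bit uses
  // fxsave/fxrstor, which additionally cover the XMM registers and MXCSR.
  // FPUStateSizeInWords is assumed to be sized for the corresponding save area.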
2979 subptr(rsp, FPUStateSizeInWords * wordSize);
2980 #ifndef _LP64
2981 fnsave(Address(rsp, 0));
2982 fwait();
2983 #else
2984 fxsave(Address(rsp, 0));
2985 #endif // LP64
2986 }
2987
2988 void MacroAssembler::push_IU_state() {
2989 // Push flags first because pusha kills them
2990 pushf();
2991 // Make sure rsp stays 16-byte aligned
2992 LP64_ONLY(subq(rsp, 8));
2993 pusha();
2994 }
2995
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) {
  // determine java_thread register
2997 if (!java_thread->is_valid()) {
2998 java_thread = rdi;
2999 get_thread(java_thread);
3000 }
3001 // we must set sp to zero to clear frame
3002 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3003 // must clear fp, so that compiled frames are not confused; it is
3004 // possible that we need it only for debugging
3005 if (clear_fp) {
3006 movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3007 }
3008 // Always clear the pc because it could have been set by make_walkable()
3009 movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3010 vzeroupper();
3011 }
3012
3013 void MacroAssembler::restore_rax(Register tmp) {
3014 if (tmp == noreg) pop(rax);
3015 else if (tmp != rax) mov(rax, tmp);
3016 }
3017
3018 void MacroAssembler::round_to(Register reg, int modulus) {
3019 addptr(reg, modulus - 1);
3020 andptr(reg, -modulus);
3021 }
3022
3023 void MacroAssembler::save_rax(Register tmp) {
3024 if (tmp == noreg) push(rax);
3025 else if (tmp != rax) mov(tmp, rax);
3026 }
3027
3028 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
3029 if (at_return) {
3030 // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
3031 // we may safely use rsp instead to perform the stack watermark check.
3032 cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset()));
3033 jcc(Assembler::above, slow_path);
3034 return;
3035 }
3036 testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
3037 jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
3038 }
3039
3040 // Calls to C land
3041 //
// When entering C land, the rbp & rsp of the last Java frame have to be recorded
3043 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3044 // has to be reset to 0. This is required to allow proper stack traversal.
3045 void MacroAssembler::set_last_Java_frame(Register java_thread,
3046 Register last_java_sp,
3047 Register last_java_fp,
3048 address last_java_pc) {
3049 vzeroupper();
3050 // determine java_thread register
3051 if (!java_thread->is_valid()) {
3052 java_thread = rdi;
3053 get_thread(java_thread);
3054 }
3055 // determine last_java_sp register
3056 if (!last_java_sp->is_valid()) {
3057 last_java_sp = rsp;
3058 }
3059
3060 // last_java_fp is optional
3061
3062 if (last_java_fp->is_valid()) {
3063 movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3064 }
3065
3066 // last_java_pc is optional
3067
3068 if (last_java_pc != NULL) {
3069 lea(Address(java_thread,
3070 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3071 InternalAddress(last_java_pc));
3072
3073 }
3074 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3075 }
3076
3077 void MacroAssembler::shlptr(Register dst, int imm8) {
3078 LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3079 }
3080
3081 void MacroAssembler::shrptr(Register dst, int imm8) {
3082 LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3083 }
3084
3085 void MacroAssembler::sign_extend_byte(Register reg) {
3086 if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3087 movsbl(reg, reg); // movsxb
3088 } else {
3089 shll(reg, 24);
3090 sarl(reg, 24);
3091 }
3092 }
3093
3094 void MacroAssembler::sign_extend_short(Register reg) {
3095 if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3096 movswl(reg, reg); // movsxw
3097 } else {
3098 shll(reg, 16);
3099 sarl(reg, 16);
3100 }
3101 }
3102
3103 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3104 assert(reachable(src), "Address should be reachable");
3105 testl(dst, as_Address(src));
3106 }
3107
3108 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3109 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3110 Assembler::pcmpeqb(dst, src);
3111 }
3112
3113 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3114 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3115 Assembler::pcmpeqw(dst, src);
3116 }
3117
3118 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3119 assert((dst->encoding() < 16),"XMM register should be 0-15");
3120 Assembler::pcmpestri(dst, src, imm8);
3121 }
3122
3123 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3124 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3125 Assembler::pcmpestri(dst, src, imm8);
3126 }
3127
3128 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3129 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3130 Assembler::pmovzxbw(dst, src);
3131 }
3132
3133 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3134 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3135 Assembler::pmovzxbw(dst, src);
3136 }
3137
3138 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3139 assert((src->encoding() < 16),"XMM register should be 0-15");
3140 Assembler::pmovmskb(dst, src);
3141 }
3142
3143 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3144 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3145 Assembler::ptest(dst, src);
3146 }
3147
3148 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3149 if (reachable(src)) {
3150 Assembler::sqrtsd(dst, as_Address(src));
3151 } else {
3152 lea(rscratch1, src);
3153 Assembler::sqrtsd(dst, Address(rscratch1, 0));
3154 }
3155 }
3156
3157 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3158 if (reachable(src)) {
3159 Assembler::sqrtss(dst, as_Address(src));
3160 } else {
3161 lea(rscratch1, src);
3162 Assembler::sqrtss(dst, Address(rscratch1, 0));
3163 }
3164 }
3165
3166 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3167 if (reachable(src)) {
3168 Assembler::subsd(dst, as_Address(src));
3169 } else {
3170 lea(rscratch1, src);
3171 Assembler::subsd(dst, Address(rscratch1, 0));
3172 }
3173 }
3174
3175 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) {
3176 if (reachable(src)) {
3177 Assembler::roundsd(dst, as_Address(src), rmode);
3178 } else {
3179 lea(scratch_reg, src);
3180 Assembler::roundsd(dst, Address(scratch_reg, 0), rmode);
3181 }
3182 }
3183
3184 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3185 if (reachable(src)) {
3186 Assembler::subss(dst, as_Address(src));
3187 } else {
3188 lea(rscratch1, src);
3189 Assembler::subss(dst, Address(rscratch1, 0));
3190 }
3191 }
3192
3193 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3194 if (reachable(src)) {
3195 Assembler::ucomisd(dst, as_Address(src));
3196 } else {
3197 lea(rscratch1, src);
3198 Assembler::ucomisd(dst, Address(rscratch1, 0));
3199 }
3200 }
3201
3202 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3203 if (reachable(src)) {
3204 Assembler::ucomiss(dst, as_Address(src));
3205 } else {
3206 lea(rscratch1, src);
3207 Assembler::ucomiss(dst, Address(rscratch1, 0));
3208 }
3209 }
3210
3211 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3212 // Used in sign-bit flipping with aligned address.
3213 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3214 if (reachable(src)) {
3215 Assembler::xorpd(dst, as_Address(src));
3216 } else {
3217 lea(scratch_reg, src);
3218 Assembler::xorpd(dst, Address(scratch_reg, 0));
3219 }
3220 }
3221
3222 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
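  // When dst == src this is the usual register-zeroing idiom. On AVX-512
  // hardware without AVX512DQ the EVEX encoding of xorpd is unavailable
  // (presumably an issue for the extended registers), so a 512-bit vpxor,
  // which only needs AVX512F, is emitted instead and clears the whole
  // register. xorps below follows the same pattern.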
3223 if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3224 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3225 }
3226 else {
3227 Assembler::xorpd(dst, src);
3228 }
3229 }
3230
3231 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3232 if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3233 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3234 } else {
3235 Assembler::xorps(dst, src);
3236 }
3237 }
3238
3239 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3240 // Used in sign-bit flipping with aligned address.
3241 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3242 if (reachable(src)) {
3243 Assembler::xorps(dst, as_Address(src));
3244 } else {
3245 lea(scratch_reg, src);
3246 Assembler::xorps(dst, Address(scratch_reg, 0));
3247 }
3248 }
3249
3250 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
  // The memory operand is typically an aligned constant table (e.g. a shuffle
  // mask); non-AVX pshufb requires its memory operand to be 16-byte aligned.
3252 bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3253 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3254 if (reachable(src)) {
3255 Assembler::pshufb(dst, as_Address(src));
3256 } else {
3257 lea(rscratch1, src);
3258 Assembler::pshufb(dst, Address(rscratch1, 0));
3259 }
3260 }
3261
3262 // AVX 3-operands instructions
3263
3264 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3265 if (reachable(src)) {
3266 vaddsd(dst, nds, as_Address(src));
3267 } else {
3268 lea(rscratch1, src);
3269 vaddsd(dst, nds, Address(rscratch1, 0));
3270 }
3271 }
3272
3273 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3274 if (reachable(src)) {
3275 vaddss(dst, nds, as_Address(src));
3276 } else {
3277 lea(rscratch1, src);
3278 vaddss(dst, nds, Address(rscratch1, 0));
3279 }
3280 }
3281
3282 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3283 assert(UseAVX > 0, "requires some form of AVX");
3284 if (reachable(src)) {
3285 Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
3286 } else {
3287 lea(rscratch, src);
3288 Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
3289 }
3290 }
3291
3292 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3293 assert(UseAVX > 0, "requires some form of AVX");
3294 if (reachable(src)) {
3295 Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3296 } else {
3297 lea(rscratch, src);
3298 Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3299 }
3300 }
3301
3302 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3303 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3304 vandps(dst, nds, negate_field, vector_len);
3305 }
3306
3307 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3308 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3309 vandpd(dst, nds, negate_field, vector_len);
3310 }
3311
3312 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3313 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3314 Assembler::vpaddb(dst, nds, src, vector_len);
3315 }
3316
3317 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3318 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3319 Assembler::vpaddb(dst, nds, src, vector_len);
3320 }
3321
3322 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3323 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3324 Assembler::vpaddw(dst, nds, src, vector_len);
3325 }
3326
3327 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3328 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3329 Assembler::vpaddw(dst, nds, src, vector_len);
3330 }
3331
3332 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3333 if (reachable(src)) {
3334 Assembler::vpand(dst, nds, as_Address(src), vector_len);
3335 } else {
3336 lea(scratch_reg, src);
3337 Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3338 }
3339 }
3340
3341 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3342 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3343 Assembler::vpbroadcastw(dst, src, vector_len);
3344 }
3345
3346 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3347 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3348 Assembler::vpcmpeqb(dst, nds, src, vector_len);
3349 }
3350
3351 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3352 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3353 Assembler::vpcmpeqw(dst, nds, src, vector_len);
3354 }
3355
3356 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds,
3357 AddressLiteral src, int vector_len, Register scratch_reg) {
3358 if (reachable(src)) {
3359 Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3360 } else {
3361 lea(scratch_reg, src);
3362 Assembler::evpcmpeqd(kdst, mask, nds, Address(scratch_reg, 0), vector_len);
3363 }
3364 }
3365
3366 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3367 int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3368 if (reachable(src)) {
3369 Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3370 } else {
3371 lea(scratch_reg, src);
3372 Assembler::evpcmpd(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3373 }
3374 }
3375
3376 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3377 int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3378 if (reachable(src)) {
3379 Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3380 } else {
3381 lea(scratch_reg, src);
3382 Assembler::evpcmpq(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3383 }
3384 }
3385
3386 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3387 int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3388 if (reachable(src)) {
3389 Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3390 } else {
3391 lea(scratch_reg, src);
3392 Assembler::evpcmpb(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3393 }
3394 }
3395
3396 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3397 int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3398 if (reachable(src)) {
3399 Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3400 } else {
3401 lea(scratch_reg, src);
3402 Assembler::evpcmpw(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3403 }
3404 }
3405
3406 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3407 if (width == Assembler::Q) {
3408 Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3409 } else {
3410 Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3411 }
3412 }
3413
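// Emulate a predicated vector compare (eq/neq/le/nlt/lt/nle) using AVX/AVX2
// packed compares. The encodings below are the pcmpeq*/pcmpgt* opcode bytes
// for the requested element width; negated predicates are formed by XOR-ing
// the compare result with an all-ones vector, and lt/nlt by swapping the
// operands of the greater-than compare.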
3414 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg) {
3415 int eq_cond_enc = 0x29;
3416 int gt_cond_enc = 0x37;
3417 if (width != Assembler::Q) {
3418 eq_cond_enc = 0x74 + width;
3419 gt_cond_enc = 0x64 + width;
3420 }
3421 switch (cond) {
3422 case eq:
3423 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3424 break;
3425 case neq:
3426 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3427 vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3428 break;
3429 case le:
3430 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3431 vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3432 break;
3433 case nlt:
3434 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3435 vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3436 break;
3437 case lt:
3438 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3439 break;
3440 case nle:
3441 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3442 break;
3443 default:
3444 assert(false, "Should not reach here");
3445 }
3446 }
3447
3448 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3449 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3450 Assembler::vpmovzxbw(dst, src, vector_len);
3451 }
3452
3453 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3454 assert((src->encoding() < 16),"XMM register should be 0-15");
3455 Assembler::vpmovmskb(dst, src, vector_len);
3456 }
3457
3458 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3459 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3460 Assembler::vpmullw(dst, nds, src, vector_len);
3461 }
3462
3463 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3464 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3465 Assembler::vpmullw(dst, nds, src, vector_len);
3466 }
3467
3468 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3469 assert((UseAVX > 0), "AVX support is needed");
3470 if (reachable(src)) {
3471 Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3472 } else {
3473 lea(scratch_reg, src);
3474 Assembler::vpmulld(dst, nds, Address(scratch_reg, 0), vector_len);
3475 }
3476 }
3477
3478 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3479 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3480 Assembler::vpsubb(dst, nds, src, vector_len);
3481 }
3482
3483 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3484 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3485 Assembler::vpsubb(dst, nds, src, vector_len);
3486 }
3487
3488 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3489 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3490 Assembler::vpsubw(dst, nds, src, vector_len);
3491 }
3492
3493 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3494 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3495 Assembler::vpsubw(dst, nds, src, vector_len);
3496 }
3497
3498 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3499 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3500 Assembler::vpsraw(dst, nds, shift, vector_len);
3501 }
3502
3503 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3504 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3505 Assembler::vpsraw(dst, nds, shift, vector_len);
3506 }
3507
3508 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3509 assert(UseAVX > 2,"");
3510 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3511 vector_len = 2;
3512 }
3513 Assembler::evpsraq(dst, nds, shift, vector_len);
3514 }
3515
3516 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3517 assert(UseAVX > 2,"");
3518 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3519 vector_len = 2;
3520 }
3521 Assembler::evpsraq(dst, nds, shift, vector_len);
3522 }
3523
3524 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3525 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3526 Assembler::vpsrlw(dst, nds, shift, vector_len);
3527 }
3528
3529 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3530 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3531 Assembler::vpsrlw(dst, nds, shift, vector_len);
3532 }
3533
3534 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3535 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3536 Assembler::vpsllw(dst, nds, shift, vector_len);
3537 }
3538
3539 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3540 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3541 Assembler::vpsllw(dst, nds, shift, vector_len);
3542 }
3543
3544 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3545 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3546 Assembler::vptest(dst, src);
3547 }
3548
3549 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3550 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3551 Assembler::punpcklbw(dst, src);
3552 }
3553
3554 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3555 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3556 Assembler::pshufd(dst, src, mode);
3557 }
3558
3559 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3560 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3561 Assembler::pshuflw(dst, src, mode);
3562 }
3563
3564 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3565 if (reachable(src)) {
3566 vandpd(dst, nds, as_Address(src), vector_len);
3567 } else {
3568 lea(scratch_reg, src);
3569 vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3570 }
3571 }
3572
3573 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3574 if (reachable(src)) {
3575 vandps(dst, nds, as_Address(src), vector_len);
3576 } else {
3577 lea(scratch_reg, src);
3578 vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3579 }
3580 }
3581
3582 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3583 bool merge, int vector_len, Register scratch_reg) {
3584 if (reachable(src)) {
3585 Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3586 } else {
3587 lea(scratch_reg, src);
3588 Assembler::evpord(dst, mask, nds, Address(scratch_reg, 0), merge, vector_len);
3589 }
3590 }
3591
3592 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3593 if (reachable(src)) {
3594 vdivsd(dst, nds, as_Address(src));
3595 } else {
3596 lea(rscratch1, src);
3597 vdivsd(dst, nds, Address(rscratch1, 0));
3598 }
3599 }
3600
3601 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3602 if (reachable(src)) {
3603 vdivss(dst, nds, as_Address(src));
3604 } else {
3605 lea(rscratch1, src);
3606 vdivss(dst, nds, Address(rscratch1, 0));
3607 }
3608 }
3609
3610 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3611 if (reachable(src)) {
3612 vmulsd(dst, nds, as_Address(src));
3613 } else {
3614 lea(rscratch1, src);
3615 vmulsd(dst, nds, Address(rscratch1, 0));
3616 }
3617 }
3618
3619 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3620 if (reachable(src)) {
3621 vmulss(dst, nds, as_Address(src));
3622 } else {
3623 lea(rscratch1, src);
3624 vmulss(dst, nds, Address(rscratch1, 0));
3625 }
3626 }
3627
3628 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3629 if (reachable(src)) {
3630 vsubsd(dst, nds, as_Address(src));
3631 } else {
3632 lea(rscratch1, src);
3633 vsubsd(dst, nds, Address(rscratch1, 0));
3634 }
3635 }
3636
3637 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3638 if (reachable(src)) {
3639 vsubss(dst, nds, as_Address(src));
3640 } else {
3641 lea(rscratch1, src);
3642 vsubss(dst, nds, Address(rscratch1, 0));
3643 }
3644 }
3645
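// Negate a scalar float/double by XOR-ing it with a sign-bit mask; the
// AddressLiteral is expected to point at the matching sign-flip constant.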
3646 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3647 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3648 vxorps(dst, nds, src, Assembler::AVX_128bit);
3649 }
3650
3651 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3652 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3653 vxorpd(dst, nds, src, Assembler::AVX_128bit);
3654 }
3655
3656 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3657 if (reachable(src)) {
3658 vxorpd(dst, nds, as_Address(src), vector_len);
3659 } else {
3660 lea(scratch_reg, src);
3661 vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
3662 }
3663 }
3664
3665 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3666 if (reachable(src)) {
3667 vxorps(dst, nds, as_Address(src), vector_len);
3668 } else {
3669 lea(scratch_reg, src);
3670 vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
3671 }
3672 }
3673
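// AVX1 provides no 256-bit integer vpxor, so for ymm-wide requests on
// AVX1-only hardware fall back to the bitwise-equivalent FP-domain vxorpd.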
3674 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3675 if (UseAVX > 1 || (vector_len < 1)) {
3676 if (reachable(src)) {
3677 Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3678 } else {
3679 lea(scratch_reg, src);
3680 Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
3681 }
3682 }
3683 else {
3684 MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
3685 }
3686 }
3687
3688 void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3689 if (reachable(src)) {
3690 Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3691 } else {
3692 lea(scratch_reg, src);
3693 Assembler::vpermd(dst, nds, Address(scratch_reg, 0), vector_len);
3694 }
3695 }
3696
3697 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
3698 const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
3699 STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
3700 // The inverted mask is sign-extended
3701 andptr(possibly_jweak, inverted_jweak_mask);
3702 }
3703
3704 void MacroAssembler::resolve_jobject(Register value,
3705 Register thread,
3706 Register tmp) {
3707 assert_different_registers(value, thread, tmp);
3708 Label done, not_weak;
3709 testptr(value, value);
3710 jcc(Assembler::zero, done); // Use NULL as-is.
3711 testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
3712 jcc(Assembler::zero, not_weak);
3713 // Resolve jweak.
3714 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3715 value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
3716 verify_oop(value);
3717 jmp(done);
3718 bind(not_weak);
3719 // Resolve (untagged) jobject.
3720 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
3721 verify_oop(value);
3722 bind(done);
3723 }
3724
3725 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3726 LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
3727 }
3728
3729 // Force generation of a 4-byte immediate value even if it fits into 8 bits
3730 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3731 LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
3732 }
3733
3734 void MacroAssembler::subptr(Register dst, Register src) {
3735 LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
3736 }
3737
3738 // C++ bool manipulation
3739 void MacroAssembler::testbool(Register dst) {
3740 if(sizeof(bool) == 1)
3741 testb(dst, 0xff);
3742 else if(sizeof(bool) == 2) {
3743 // testw implementation needed for two byte bools
3744 ShouldNotReachHere();
3745 } else if(sizeof(bool) == 4)
3746 testl(dst, dst);
3747 else
3748 // unsupported
3749 ShouldNotReachHere();
3750 }
3751
3752 void MacroAssembler::testptr(Register dst, Register src) {
3753 LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
3754 }
3755
3756 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3757 void MacroAssembler::tlab_allocate(Register thread, Register obj,
3758 Register var_size_in_bytes,
3759 int con_size_in_bytes,
3760 Register t1,
3761 Register t2,
3762 Label& slow_case) {
3763 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3764 bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3765 }
3766
3767 // Defines obj, preserves var_size_in_bytes
3768 void MacroAssembler::eden_allocate(Register thread, Register obj,
3769 Register var_size_in_bytes,
3770 int con_size_in_bytes,
3771 Register t1,
3772 Label& slow_case) {
3773 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3774 bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
3775 }
3776
3777 // Preserves the contents of address, destroys the contents of length_in_bytes and temp.
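// Clears length_in_bytes bytes starting at [address + offset_in_bytes], mostly
// in 8-byte chunks; an unaligned leading 4-byte word (64-bit) or an odd topmost
// word (32-bit) is cleared separately before the main loop.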
3778 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
3779 assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
3780 assert((offset_in_bytes & (BytesPerInt - 1)) == 0, "offset must be a multiple of BytesPerInt");
3781 Label done;
3782
3783 testptr(length_in_bytes, length_in_bytes);
3784 jcc(Assembler::zero, done);
3785
3786 // Emit a single 32-bit store to clear leading bytes, if necessary.
3787 xorptr(temp, temp); // use _zero reg to clear memory (shorter code)
3788 #ifdef _LP64
3789 if (!is_aligned(offset_in_bytes, BytesPerWord)) {
3790 movl(Address(address, offset_in_bytes), temp);
3791 offset_in_bytes += BytesPerInt;
3792 decrement(length_in_bytes, BytesPerInt);
3793 }
3794 assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
3795 testptr(length_in_bytes, length_in_bytes);
3796 jcc(Assembler::zero, done);
3797 #endif
3798
3799 // initialize topmost word, divide index by 2, check if odd and test if zero
3800 // note: for the remaining code to work, index must be a multiple of BytesPerWord
3801 #ifdef ASSERT
3802 {
3803 Label L;
3804 testptr(length_in_bytes, BytesPerWord - 1);
3805 jcc(Assembler::zero, L);
3806 stop("length must be a multiple of BytesPerWord");
3807 bind(L);
3808 }
3809 #endif
3810 Register index = length_in_bytes;
3811 if (UseIncDec) {
3812 shrptr(index, 3); // divide by 8/16 and set carry flag if bit 2 was set
3813 } else {
3814 shrptr(index, 2); // use 2 instructions to avoid partial flag stall
3815 shrptr(index, 1);
3816 }
3817 #ifndef _LP64
3818 // index could have not been a multiple of 8 (i.e., bit 2 was set)
3819 {
3820 Label even;
3821 // note: if index was a multiple of 8, then it cannot
3822 // be 0 now otherwise it must have been 0 before
3823 // => if it is even, we don't need to check for 0 again
3824 jcc(Assembler::carryClear, even);
3825 // clear topmost word (no jump would be needed if conditional assignment worked here)
3826 movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
3827 // index could be 0 now, must check again
3828 jcc(Assembler::zero, done);
3829 bind(even);
3830 }
3831 #endif // !_LP64
3832 // initialize remaining object fields: index is a multiple of 2 now
3833 {
3834 Label loop;
3835 bind(loop);
3836 movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
3837 NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
3838 decrement(index);
3839 jcc(Assembler::notZero, loop);
3840 }
3841
3842 bind(done);
3843 }
3844
3845 // Look up the method for a megamorphic invokeinterface call.
3846 // The target method is determined by <intf_klass, itable_index>.
3847 // The receiver klass is in recv_klass.
3848 // On success, the result will be in method_result, and execution falls through.
3849 // On failure, execution transfers to the given label.
3850 void MacroAssembler::lookup_interface_method(Register recv_klass,
3851 Register intf_klass,
3852 RegisterOrConstant itable_index,
3853 Register method_result,
3854 Register scan_temp,
3855 Label& L_no_such_interface,
3856 bool return_method) {
3857 assert_different_registers(recv_klass, intf_klass, scan_temp);
3858 assert_different_registers(method_result, intf_klass, scan_temp);
3859 assert(recv_klass != method_result || !return_method,
3860 "recv_klass can be destroyed when method isn't needed");
3861
3862 assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3863 "caller must use same register for non-constant itable index as for method");
3864
3865 // Compute start of first itableOffsetEntry (which is at the end of the vtable)
3866 int vtable_base = in_bytes(Klass::vtable_start_offset());
3867 int itentry_off = itableMethodEntry::method_offset_in_bytes();
3868 int scan_step = itableOffsetEntry::size() * wordSize;
3869 int vte_size = vtableEntry::size_in_bytes();
3870 Address::ScaleFactor times_vte_scale = Address::times_ptr;
3871 assert(vte_size == wordSize, "else adjust times_vte_scale");
3872
3873 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3874
3875 // %%% Could store the aligned, prescaled offset in the klassoop.
3876 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
3877
3878 if (return_method) {
3879 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3880 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3881 lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
3882 }
3883
3884 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
3885 // if (scan->interface() == intf) {
3886 // result = (klass + scan->offset() + itable_index);
3887 // }
3888 // }
3889 Label search, found_method;
3890
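  // The first itable entry is checked by peeled code (peel == 1); the remaining
  // entries are scanned in the loop at 'search', which falls through to
  // found_method on a hit, keeping the common-case branches short (jccb).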
3891 for (int peel = 1; peel >= 0; peel--) {
3892 movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
3893 cmpptr(intf_klass, method_result);
3894
3895 if (peel) {
3896 jccb(Assembler::equal, found_method);
3897 } else {
3898 jccb(Assembler::notEqual, search);
3899 // (invert the test to fall through to found_method...)
3900 }
3901
3902 if (!peel) break;
3903
3904 bind(search);
3905
3906 // Check that the previous entry is non-null. A null entry means that
3907 // the receiver class doesn't implement the interface, and wasn't the
3908 // same as when the caller was compiled.
3909 testptr(method_result, method_result);
3910 jcc(Assembler::zero, L_no_such_interface);
3911 addptr(scan_temp, scan_step);
3912 }
3913
3914 bind(found_method);
3915
3916 if (return_method) {
3917 // Got a hit.
3918 movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
3919 movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
3920 }
3921 }
3922
3923
3924 // virtual method calling
3925 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3926 RegisterOrConstant vtable_index,
3927 Register method_result) {
3928 const int base = in_bytes(Klass::vtable_start_offset());
3929 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
3930 Address vtable_entry_addr(recv_klass,
3931 vtable_index, Address::times_ptr,
3932 base + vtableEntry::method_offset_in_bytes());
3933 movptr(method_result, vtable_entry_addr);
3934 }
3935
3936
3937 void MacroAssembler::check_klass_subtype(Register sub_klass,
3938 Register super_klass,
3939 Register temp_reg,
3940 Label& L_success) {
3941 Label L_failure;
3942 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
3943 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
3944 bind(L_failure);
3945 }
3946
3947
3948 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3949 Register super_klass,
3950 Register temp_reg,
3951 Label* L_success,
3952 Label* L_failure,
3953 Label* L_slow_path,
3954 RegisterOrConstant super_check_offset) {
3955 assert_different_registers(sub_klass, super_klass, temp_reg);
3956 bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
3957 if (super_check_offset.is_register()) {
3958 assert_different_registers(sub_klass, super_klass,
3959 super_check_offset.as_register());
3960 } else if (must_load_sco) {
3961 assert(temp_reg != noreg, "supply either a temp or a register offset");
3962 }
3963
3964 Label L_fallthrough;
3965 int label_nulls = 0;
3966 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
3967 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
3968 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
3969 assert(label_nulls <= 1, "at most one NULL in the batch");
3970
3971 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3972 int sco_offset = in_bytes(Klass::super_check_offset_offset());
3973 Address super_check_offset_addr(super_klass, sco_offset);
3974
3975 // Hacked jcc, which "knows" that L_fallthrough, at least, is in
3976 // range of a jccb. If this routine grows larger, reconsider at
3977 // least some of these.
3978 #define local_jcc(assembler_cond, label) \
3979 if (&(label) == &L_fallthrough) jccb(assembler_cond, label); \
3980 else jcc( assembler_cond, label) /*omit semi*/
3981
3982 // Hacked jmp, which may only be used just before L_fallthrough.
3983 #define final_jmp(label) \
3984 if (&(label) == &L_fallthrough) { /*do nothing*/ } \
3985 else jmp(label) /*omit semi*/
3986
3987 // If the pointers are equal, we are done (e.g., String[] elements).
3988 // This self-check enables sharing of secondary supertype arrays among
3989 // non-primary types such as array-of-interface. Otherwise, each such
3990 // type would need its own customized SSA.
3991 // We move this check to the front of the fast path because many
3992 // type checks are in fact trivially successful in this manner,
3993 // so we get a nicely predicted branch right at the start of the check.
3994 cmpptr(sub_klass, super_klass);
3995 local_jcc(Assembler::equal, *L_success);
3996
3997 // Check the supertype display:
3998 if (must_load_sco) {
3999 // Positive movl does right thing on LP64.
4000 movl(temp_reg, super_check_offset_addr);
4001 super_check_offset = RegisterOrConstant(temp_reg);
4002 }
4003 Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4004 cmpptr(super_klass, super_check_addr); // load displayed supertype
4005
4006 // This check has worked decisively for primary supers.
4007 // Secondary supers are sought in the super_cache ('super_cache_addr').
4008 // (Secondary supers are interfaces and very deeply nested subtypes.)
4009 // This works in the same check above because of a tricky aliasing
4010 // between the super_cache and the primary super display elements.
4011 // (The 'super_check_addr' can address either, as the case requires.)
4012 // Note that the cache is updated below if it does not help us find
4013 // what we need immediately.
4014 // So if it was a primary super, we can just fail immediately.
4015 // Otherwise, it's the slow path for us (no success at this point).
4016
4017 if (super_check_offset.is_register()) {
4018 local_jcc(Assembler::equal, *L_success);
4019 cmpl(super_check_offset.as_register(), sc_offset);
4020 if (L_failure == &L_fallthrough) {
4021 local_jcc(Assembler::equal, *L_slow_path);
4022 } else {
4023 local_jcc(Assembler::notEqual, *L_failure);
4024 final_jmp(*L_slow_path);
4025 }
4026 } else if (super_check_offset.as_constant() == sc_offset) {
4027 // Need a slow path; fast failure is impossible.
4028 if (L_slow_path == &L_fallthrough) {
4029 local_jcc(Assembler::equal, *L_success);
4030 } else {
4031 local_jcc(Assembler::notEqual, *L_slow_path);
4032 final_jmp(*L_success);
4033 }
4034 } else {
4035 // No slow path; it's a fast decision.
4036 if (L_failure == &L_fallthrough) {
4037 local_jcc(Assembler::equal, *L_success);
4038 } else {
4039 local_jcc(Assembler::notEqual, *L_failure);
4040 final_jmp(*L_success);
4041 }
4042 }
4043
4044 bind(L_fallthrough);
4045
4046 #undef local_jcc
4047 #undef final_jmp
4048 }
4049
4050
4051 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4052 Register super_klass,
4053 Register temp_reg,
4054 Register temp2_reg,
4055 Label* L_success,
4056 Label* L_failure,
4057 bool set_cond_codes) {
4058 assert_different_registers(sub_klass, super_klass, temp_reg);
4059 if (temp2_reg != noreg)
4060 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4061 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4062
4063 Label L_fallthrough;
4064 int label_nulls = 0;
4065 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
4066 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
4067 assert(label_nulls <= 1, "at most one NULL in the batch");
4068
4069 // a couple of useful fields in sub_klass:
4070 int ss_offset = in_bytes(Klass::secondary_supers_offset());
4071 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4072 Address secondary_supers_addr(sub_klass, ss_offset);
4073 Address super_cache_addr( sub_klass, sc_offset);
4074
4075 // Do a linear scan of the secondary super-klass chain.
4076 // This code is rarely used, so simplicity is a virtue here.
4077 // The repne_scan instruction uses fixed registers, which we must spill.
4078 // Don't worry too much about pre-existing connections with the input regs.
4079
4080 assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4081 assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4082
4083 // Get super_klass value into rax (even if it was in rdi or rcx).
4084 bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4085 if (super_klass != rax) {
4086 if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4087 mov(rax, super_klass);
4088 }
4089 if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4090 if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4091
4092 #ifndef PRODUCT
4093 int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4094 ExternalAddress pst_counter_addr((address) pst_counter);
4095 NOT_LP64( incrementl(pst_counter_addr) );
4096 LP64_ONLY( lea(rcx, pst_counter_addr) );
4097 LP64_ONLY( incrementl(Address(rcx, 0)) );
4098 #endif //PRODUCT
4099
4100 // We will consult the secondary-super array.
4101 movptr(rdi, secondary_supers_addr);
4102 // Load the array length. (Positive movl does right thing on LP64.)
4103 movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4104 // Skip to start of data.
4105 addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4106
4107 // Scan RCX words at [RDI] for an occurrence of RAX.
4108 // Set NZ/Z based on last compare.
4109 // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
4110 // not change flags (only scas instruction which is repeated sets flags).
4111 // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
4112
4113 testptr(rax,rax); // Set Z = 0
4114 repne_scan();
4115
4116 // Unspill the temp. registers:
4117 if (pushed_rdi) pop(rdi);
4118 if (pushed_rcx) pop(rcx);
4119 if (pushed_rax) pop(rax);
4120
4121 if (set_cond_codes) {
4122 // Special hack for the AD files: rdi is guaranteed non-zero.
4123 assert(!pushed_rdi, "rdi must be left non-NULL");
4124 // Also, the condition codes are properly set Z/NZ on succeed/failure.
4125 }
4126
4127 if (L_failure == &L_fallthrough)
4128 jccb(Assembler::notEqual, *L_failure);
4129 else jcc(Assembler::notEqual, *L_failure);
4130
4131 // Success. Cache the super we found and proceed in triumph.
4132 movptr(super_cache_addr, super_klass);
4133
4134 if (L_success != &L_fallthrough) {
4135 jmp(*L_success);
4136 }
4137
4138 #undef IS_A_TEMP
4139
4140 bind(L_fallthrough);
4141 }
4142
4143 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4144 assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
4145
4146 Label L_fallthrough;
4147 if (L_fast_path == NULL) {
4148 L_fast_path = &L_fallthrough;
4149 } else if (L_slow_path == NULL) {
4150 L_slow_path = &L_fallthrough;
4151 }
4152
4153 // Fast path check: class is fully initialized
4154 cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4155 jcc(Assembler::equal, *L_fast_path);
4156
4157 // Fast path check: current thread is initializer thread
4158 cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4159 if (L_slow_path == &L_fallthrough) {
4160 jcc(Assembler::equal, *L_fast_path);
4161 bind(*L_slow_path);
4162 } else if (L_fast_path == &L_fallthrough) {
4163 jcc(Assembler::notEqual, *L_slow_path);
4164 bind(*L_fast_path);
4165 } else {
4166 Unimplemented();
4167 }
4168 }
4169
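// Conditional move of a 32-bit value; on CPUs without CMOV support it is
// emulated by branching around a plain move.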
4170 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4171 if (VM_Version::supports_cmov()) {
4172 cmovl(cc, dst, src);
4173 } else {
4174 Label L;
4175 jccb(negate_condition(cc), L);
4176 movl(dst, src);
4177 bind(L);
4178 }
4179 }
4180
4181 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4182 if (VM_Version::supports_cmov()) {
4183 cmovl(cc, dst, src);
4184 } else {
4185 Label L;
4186 jccb(negate_condition(cc), L);
4187 movl(dst, src);
4188 bind(L);
4189 }
4190 }
4191
4192 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4193 if (!VerifyOops) return;
4194
4195 // Pass register number to verify_oop_subroutine
4196 const char* b = NULL;
4197 {
4198 ResourceMark rm;
4199 stringStream ss;
4200 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4201 b = code_string(ss.as_string());
4202 }
4203 BLOCK_COMMENT("verify_oop {");
4204 #ifdef _LP64
4205 push(rscratch1); // save r10, trashed by movptr()
4206 #endif
4207 push(rax); // save rax,
4208 push(reg); // pass register argument
4209 ExternalAddress buffer((address) b);
4210 // avoid using pushptr, as it modifies scratch registers
4211 // and our contract is not to modify anything
4212 movptr(rax, buffer.addr());
4213 push(rax);
4214 // call indirectly to solve generation ordering problem
4215 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4216 call(rax);
4217 // Caller pops the arguments (oop, message) and restores rax, r10
4218 BLOCK_COMMENT("} verify_oop");
4219 }
4220
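// Fill dst with all-one bits: vpternlogd with immediate 0xFF writes all ones
// directly (AVX-512); otherwise comparing the register with itself for
// equality produces the same result.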
4221 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
4222 if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4223 vpternlogd(dst, 0xFF, dst, dst, vector_len);
4224 } else {
4225 assert(UseAVX > 0, "");
4226 vpcmpeqb(dst, dst, dst, vector_len);
4227 }
4228 }
4229
4230 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4231 int extra_slot_offset) {
4232 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4233 int stackElementSize = Interpreter::stackElementSize;
4234 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4235 #ifdef ASSERT
4236 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4237 assert(offset1 - offset == stackElementSize, "correct arithmetic");
4238 #endif
4239 Register scale_reg = noreg;
4240 Address::ScaleFactor scale_factor = Address::no_scale;
4241 if (arg_slot.is_constant()) {
4242 offset += arg_slot.as_constant() * stackElementSize;
4243 } else {
4244 scale_reg = arg_slot.as_register();
4245 scale_factor = Address::times(stackElementSize);
4246 }
4247 offset += wordSize; // return PC is on stack
4248 return Address(rsp, scale_reg, scale_factor, offset);
4249 }
4250
4251 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
4252 if (!VerifyOops) return;
4253
4254 // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4255 // Pass register number to verify_oop_subroutine
4256 const char* b = NULL;
4257 {
4258 ResourceMark rm;
4259 stringStream ss;
4260 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
4261 b = code_string(ss.as_string());
4262 }
4263 #ifdef _LP64
4264 push(rscratch1); // save r10, trashed by movptr()
4265 #endif
4266 push(rax); // save rax,
4267 // addr may contain rsp so we will have to adjust it based on the push
4268 // we just did (and on 64 bit we do two pushes)
4269 // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
4270 // stores rax into addr which is backwards of what was intended.
4271 if (addr.uses(rsp)) {
4272 lea(rax, addr);
4273 pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4274 } else {
4275 pushptr(addr);
4276 }
4277
4278 ExternalAddress buffer((address) b);
4279 // pass msg argument
4280 // avoid using pushptr, as it modifies scratch registers
4281 // and our contract is not to modify anything
4282 movptr(rax, buffer.addr());
4283 push(rax);
4284
4285 // call indirectly to solve generation ordering problem
4286 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4287 call(rax);
4288 // Caller pops the arguments (addr, message) and restores rax, r10.
4289 }
4290
4291 void MacroAssembler::verify_tlab() {
4292 #ifdef ASSERT
4293 if (UseTLAB && VerifyOops) {
4294 Label next, ok;
4295 Register t1 = rsi;
4296 Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4297
4298 push(t1);
4299 NOT_LP64(push(thread_reg));
4300 NOT_LP64(get_thread(thread_reg));
4301
4302 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4303 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4304 jcc(Assembler::aboveEqual, next);
4305 STOP("assert(top >= start)");
4306 should_not_reach_here();
4307
4308 bind(next);
4309 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4310 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4311 jcc(Assembler::aboveEqual, ok);
4312 STOP("assert(top <= end)");
4313 should_not_reach_here();
4314
4315 bind(ok);
4316 NOT_LP64(pop(thread_reg));
4317 pop(t1);
4318 }
4319 #endif
4320 }
4321
4322 class ControlWord {
4323 public:
4324 int32_t _value;
4325
4326 int rounding_control() const { return (_value >> 10) & 3 ; }
4327 int precision_control() const { return (_value >> 8) & 3 ; }
4328 bool precision() const { return ((_value >> 5) & 1) != 0; }
4329 bool underflow() const { return ((_value >> 4) & 1) != 0; }
4330 bool overflow() const { return ((_value >> 3) & 1) != 0; }
4331 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
4332 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
4333 bool invalid() const { return ((_value >> 0) & 1) != 0; }
4334
4335 void print() const {
4336 // rounding control
4337 const char* rc;
4338 switch (rounding_control()) {
4339 case 0: rc = "round near"; break;
4340 case 1: rc = "round down"; break;
4341 case 2: rc = "round up "; break;
4342 case 3: rc = "chop "; break;
4343 default:
4344 rc = NULL; // silence compiler warnings
4345 fatal("Unknown rounding control: %d", rounding_control());
4346 };
4347 // precision control
4348 const char* pc;
4349 switch (precision_control()) {
4350 case 0: pc = "24 bits "; break;
4351 case 1: pc = "reserved"; break;
4352 case 2: pc = "53 bits "; break;
4353 case 3: pc = "64 bits "; break;
4354 default:
4355 pc = NULL; // silence compiler warnings
4356 fatal("Unknown precision control: %d", precision_control());
4357 };
4358 // flags
4359 char f[9];
4360 f[0] = ' ';
4361 f[1] = ' ';
4362 f[2] = (precision ()) ? 'P' : 'p';
4363 f[3] = (underflow ()) ? 'U' : 'u';
4364 f[4] = (overflow ()) ? 'O' : 'o';
4365 f[5] = (zero_divide ()) ? 'Z' : 'z';
4366 f[6] = (denormalized()) ? 'D' : 'd';
4367 f[7] = (invalid ()) ? 'I' : 'i';
4368 f[8] = '\x0';
4369 // output
4370 printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4371 }
4372
4373 };
4374
4375 class StatusWord {
4376 public:
4377 int32_t _value;
4378
4379 bool busy() const { return ((_value >> 15) & 1) != 0; }
4380 bool C3() const { return ((_value >> 14) & 1) != 0; }
4381 bool C2() const { return ((_value >> 10) & 1) != 0; }
4382 bool C1() const { return ((_value >> 9) & 1) != 0; }
4383 bool C0() const { return ((_value >> 8) & 1) != 0; }
4384 int top() const { return (_value >> 11) & 7 ; }
4385 bool error_status() const { return ((_value >> 7) & 1) != 0; }
4386 bool stack_fault() const { return ((_value >> 6) & 1) != 0; }
4387 bool precision() const { return ((_value >> 5) & 1) != 0; }
4388 bool underflow() const { return ((_value >> 4) & 1) != 0; }
4389 bool overflow() const { return ((_value >> 3) & 1) != 0; }
4390 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
4391 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
4392 bool invalid() const { return ((_value >> 0) & 1) != 0; }
4393
4394 void print() const {
4395 // condition codes
4396 char c[5];
4397 c[0] = (C3()) ? '3' : '-';
4398 c[1] = (C2()) ? '2' : '-';
4399 c[2] = (C1()) ? '1' : '-';
4400 c[3] = (C0()) ? '0' : '-';
4401 c[4] = '\x0';
4402 // flags
4403 char f[9];
4404 f[0] = (error_status()) ? 'E' : '-';
4405 f[1] = (stack_fault ()) ? 'S' : '-';
4406 f[2] = (precision ()) ? 'P' : '-';
4407 f[3] = (underflow ()) ? 'U' : '-';
4408 f[4] = (overflow ()) ? 'O' : '-';
4409 f[5] = (zero_divide ()) ? 'Z' : '-';
4410 f[6] = (denormalized()) ? 'D' : '-';
4411 f[7] = (invalid ()) ? 'I' : '-';
4412 f[8] = '\x0';
4413 // output
4414 printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, f, c, top());
4415 }
4416
4417 };
4418
4419 class TagWord {
4420 public:
4421 int32_t _value;
4422
4423 int tag_at(int i) const { return (_value >> (i*2)) & 3; }
4424
4425 void print() const {
4426 printf("%04x", _value & 0xFFFF);
4427 }
4428
4429 };
4430
4431 class FPU_Register {
4432 public:
4433 int32_t _m0;
4434 int32_t _m1;
4435 int16_t _ex;
4436
4437 bool is_indefinite() const {
4438 return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4439 }
4440
4441 void print() const {
4442 char sign = (_ex < 0) ? '-' : '+';
4443 const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : " ";
4444 printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, kind);
4445 };
4446
4447 };
4448
4449 class FPU_State {
4450 public:
4451 enum {
4452 register_size = 10,
4453 number_of_registers = 8,
4454 register_mask = 7
4455 };
4456
4457 ControlWord _control_word;
4458 StatusWord _status_word;
4459 TagWord _tag_word;
4460 int32_t _error_offset;
4461 int32_t _error_selector;
4462 int32_t _data_offset;
4463 int32_t _data_selector;
4464 int8_t _register[register_size * number_of_registers];
4465
4466 int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4467 FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; }
4468
4469 const char* tag_as_string(int tag) const {
4470 switch (tag) {
4471 case 0: return "valid";
4472 case 1: return "zero";
4473 case 2: return "special";
4474 case 3: return "empty";
4475 }
4476 ShouldNotReachHere();
4477 return NULL;
4478 }
4479
4480 void print() const {
4481 // print computation registers
4482 { int t = _status_word.top();
4483 for (int i = 0; i < number_of_registers; i++) {
4484 int j = (i - t) & register_mask;
4485 printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4486 st(j)->print();
4487 printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4488 }
4489 }
4490 printf("\n");
4491 // print control registers
4492 printf("ctrl = "); _control_word.print(); printf("\n");
4493 printf("stat = "); _status_word .print(); printf("\n");
4494 printf("tags = "); _tag_word .print(); printf("\n");
4495 }
4496
4497 };
4498
4499 class Flag_Register {
4500 public:
4501 int32_t _value;
4502
4503 bool overflow() const { return ((_value >> 11) & 1) != 0; }
4504 bool direction() const { return ((_value >> 10) & 1) != 0; }
4505 bool sign() const { return ((_value >> 7) & 1) != 0; }
4506 bool zero() const { return ((_value >> 6) & 1) != 0; }
4507 bool auxiliary_carry() const { return ((_value >> 4) & 1) != 0; }
4508 bool parity() const { return ((_value >> 2) & 1) != 0; }
4509 bool carry() const { return ((_value >> 0) & 1) != 0; }
4510
4511 void print() const {
4512 // flags
4513 char f[8];
4514 f[0] = (overflow ()) ? 'O' : '-';
4515 f[1] = (direction ()) ? 'D' : '-';
4516 f[2] = (sign ()) ? 'S' : '-';
4517 f[3] = (zero ()) ? 'Z' : '-';
4518 f[4] = (auxiliary_carry()) ? 'A' : '-';
4519 f[5] = (parity ()) ? 'P' : '-';
4520 f[6] = (carry ()) ? 'C' : '-';
4521 f[7] = '\x0';
4522 // output
4523 printf("%08x flags = %s", _value, f);
4524 }
4525
4526 };
4527
4528 class IU_Register {
4529 public:
4530 int32_t _value;
4531
4532 void print() const {
4533 printf("%08x %11d", _value, _value);
4534 }
4535
4536 };
4537
4538 class IU_State {
4539 public:
4540 Flag_Register _eflags;
4541 IU_Register _rdi;
4542 IU_Register _rsi;
4543 IU_Register _rbp;
4544 IU_Register _rsp;
4545 IU_Register _rbx;
4546 IU_Register _rdx;
4547 IU_Register _rcx;
4548 IU_Register _rax;
4549
4550 void print() const {
4551 // computation registers
4552 printf("rax, = "); _rax.print(); printf("\n");
4553 printf("rbx, = "); _rbx.print(); printf("\n");
4554 printf("rcx = "); _rcx.print(); printf("\n");
4555 printf("rdx = "); _rdx.print(); printf("\n");
4556 printf("rdi = "); _rdi.print(); printf("\n");
4557 printf("rsi = "); _rsi.print(); printf("\n");
4558 printf("rbp, = "); _rbp.print(); printf("\n");
4559 printf("rsp = "); _rsp.print(); printf("\n");
4560 printf("\n");
4561 // control registers
4562 printf("flgs = "); _eflags.print(); printf("\n");
4563 }
4564 };
4565
4566
4567 class CPU_State {
4568 public:
4569 FPU_State _fpu_state;
4570 IU_State _iu_state;
4571
4572 void print() const {
4573 printf("--------------------------------------------------\n");
4574 _iu_state .print();
4575 printf("\n");
4576 _fpu_state.print();
4577 printf("--------------------------------------------------\n");
4578 }
4579
4580 };
4581
4582
4583 static void _print_CPU_state(CPU_State* state) {
4584 state->print();
4585 };
4586
4587
4588 void MacroAssembler::print_CPU_state() {
4589 push_CPU_state();
4590 push(rsp); // pass CPU state
4591 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
4592 addptr(rsp, wordSize); // discard argument
4593 pop_CPU_state();
4594 }
4595
4596
4597 #ifndef _LP64
4598 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
4599 static int counter = 0;
4600 FPU_State* fs = &state->_fpu_state;
4601 counter++;
4602 // For leaf calls, only verify that the top few elements remain empty.
4603 // We only need 1 empty at the top for C2 code.
4604 if( stack_depth < 0 ) {
4605 if( fs->tag_for_st(7) != 3 ) {
4606 printf("FPR7 not empty\n");
4607 state->print();
4608 assert(false, "error");
4609 return false;
4610 }
4611 return true; // All other stack states do not matter
4612 }
4613
4614 assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(),
4615 "bad FPU control word");
4616
4617 // compute stack depth
4618 int i = 0;
4619 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) < 3) i++;
4620 int d = i;
4621 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
4622 // verify findings
4623 if (i != FPU_State::number_of_registers) {
4624 // stack not contiguous
4625 printf("%s: stack not contiguous at ST%d\n", s, i);
4626 state->print();
4627 assert(false, "error");
4628 return false;
4629 }
4630 // check if computed stack depth corresponds to expected stack depth
4631 if (stack_depth < 0) {
4632 // expected stack depth is -stack_depth or less
4633 if (d > -stack_depth) {
4634 // too many elements on the stack
4635 printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
4636 state->print();
4637 assert(false, "error");
4638 return false;
4639 }
4640 } else {
4641 // expected stack depth is stack_depth
4642 if (d != stack_depth) {
4643 // wrong stack depth
4644 printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
4645 state->print();
4646 assert(false, "error");
4647 return false;
4648 }
4649 }
4650 // everything is cool
4651 return true;
4652 }
4653
4654 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
4655 if (!VerifyFPU) return;
4656 push_CPU_state();
4657 push(rsp); // pass CPU state
4658 ExternalAddress msg((address) s);
4659 // pass message string s
4660 pushptr(msg.addr());
4661 push(stack_depth); // pass stack depth
4662 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
4663 addptr(rsp, 3 * wordSize); // discard arguments
4664 // check for error
4665 { Label L;
4666 testl(rax, rax);
4667 jcc(Assembler::notZero, L);
4668 int3(); // break if error condition
4669 bind(L);
4670 }
4671 pop_CPU_state();
4672 }
4673 #endif // _LP64
4674
4675 void MacroAssembler::restore_cpu_control_state_after_jni() {
4676 // Either restore the MXCSR register after returning from the JNI Call
4677 // or verify that it wasn't changed (with -Xcheck:jni flag).
4678 if (VM_Version::supports_sse()) {
4679 if (RestoreMXCSROnJNICalls) {
4680 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()));
4681 } else if (CheckJNICalls) {
4682 call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
4683 }
4684 }
4685 // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
4686 vzeroupper();
4687 // Reset k1 to 0xffff.
4688
4689 #ifdef COMPILER2
4690 if (PostLoopMultiversioning && VM_Version::supports_evex()) {
4691 push(rcx);
4692 movl(rcx, 0xffff);
4693 kmovwl(k1, rcx);
4694 pop(rcx);
4695 }
4696 #endif // COMPILER2
4697
4698 #ifndef _LP64
4699 // Either restore the x87 floating point control word after returning
4700 // from the JNI call or verify that it wasn't changed.
4701 if (CheckJNICalls) {
4702 call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
4703 }
4704 #endif // _LP64
4705 }
4706
4707 // ((OopHandle)result).resolve();
4708 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
4709 assert_different_registers(result, tmp);
4710
4711 // Only 64 bit platforms support GCs that require a tmp register
4712 // Only IN_HEAP loads require a thread_tmp register
4713 // OopHandle::resolve is an indirection like jobject.
4714 access_load_at(T_OBJECT, IN_NATIVE,
4715 result, Address(result, 0), tmp, /*tmp_thread*/noreg);
4716 }
4717
4718 // ((WeakHandle)result).resolve();
4719 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
4720 assert_different_registers(rresult, rtmp);
4721 Label resolved;
4722
4723 // A null weak handle resolves to null.
4724 cmpptr(rresult, 0);
4725 jcc(Assembler::equal, resolved);
4726
4727 // Only 64 bit platforms support GCs that require a tmp register
4728 // Only IN_HEAP loads require a thread_tmp register
4729 // WeakHandle::resolve is an indirection like jweak.
4730 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4731 rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
4732 bind(resolved);
4733 }
4734
4735 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
4736 // get mirror
4737 const int mirror_offset = in_bytes(Klass::java_mirror_offset());
4738 load_method_holder(mirror, method);
4739 movptr(mirror, Address(mirror, mirror_offset));
4740 resolve_oop_handle(mirror, tmp);
4741 }
4742
4743 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
4744 load_method_holder(rresult, rmethod);
4745 movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
4746 }
4747
4748 void MacroAssembler::load_method_holder(Register holder, Register method) {
4749 movptr(holder, Address(method, Method::const_offset())); // ConstMethod*
4750 movptr(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool*
4751 movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
4752 }
4753
4754 #ifdef _LP64
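// Load the narrow klass id of src into dst. With compact object headers the
// narrow klass lives in the upper bits of the mark word, so a monitor-locked
// object requires fetching the displaced mark from the ObjectMonitor first;
// without compact headers this is a plain 32-bit load of the klass field.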
4755 void MacroAssembler::load_nklass(Register dst, Register src) {
4756 assert(UseCompressedClassPointers, "expect compressed class pointers");
4757
4758 if (!UseCompactObjectHeaders) {
4759 movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4760 return;
4761 }
4762
4763 Label fast;
4764 movq(dst, Address(src, oopDesc::mark_offset_in_bytes()));
4765 testb(dst, markWord::monitor_value);
4766 jccb(Assembler::zero, fast);
4767
4768 // Fetch displaced header
4769 movq(dst, Address(dst, OM_OFFSET_NO_MONITOR_VALUE_TAG(header)));
4770
4771 bind(fast);
4772 shrq(dst, markWord::klass_shift);
4773 }
4774 #endif
4775
4776 void MacroAssembler::load_klass(Register dst, Register src, Register tmp, bool null_check_src) {
4777 assert_different_registers(src, tmp);
4778 assert_different_registers(dst, tmp);
4779 if (null_check_src) {
4780 if (UseCompactObjectHeaders) {
4781 null_check(src, oopDesc::mark_offset_in_bytes());
4782 } else {
4783 null_check(src, oopDesc::klass_offset_in_bytes());
4784 }
4785 }
4786 #ifdef _LP64
4787 if (UseCompressedClassPointers) {
4788 load_nklass(dst, src);
4789 decode_klass_not_null(dst, tmp);
4790 } else
4791 #endif
4792 movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4793 }
4794
4795 void MacroAssembler::load_prototype_header(Register dst, Register src, Register tmp) {
4796 load_klass(dst, src, tmp);
4797 movptr(dst, Address(dst, Klass::prototype_header_offset()));
4798 }
4799
4800 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
4801 assert(!UseCompactObjectHeaders, "not with compact headers");
4802 assert_different_registers(src, tmp);
4803 assert_different_registers(dst, tmp);
4804 #ifdef _LP64
4805 if (UseCompressedClassPointers) {
4806 encode_klass_not_null(src, tmp);
4807 movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4808 } else
4809 #endif
4810 movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4811 }
4812
4813 void MacroAssembler::cmp_klass(Register klass, Register obj, Register tmp) {
4814 #ifdef _LP64
4815 if (UseCompactObjectHeaders) {
4816 // NOTE: We need to deal with possible ObjectMonitor in object header.
4817 // Eventually we might be able to do simple movl & cmpl like in
4818 // the CCP path below.
4819 load_nklass(tmp, obj);
4820 cmpl(klass, tmp);
4821 } else if (UseCompressedClassPointers) {
4822 cmpl(klass, Address(obj, oopDesc::klass_offset_in_bytes()));
4823 } else
4824 #endif
4825 {
4826 cmpptr(klass, Address(obj, oopDesc::klass_offset_in_bytes()));
4827 }
4828 }
4829
4830 void MacroAssembler::cmp_klass(Register src, Register dst, Register tmp1, Register tmp2) {
4831 #ifdef _LP64
4832 if (UseCompactObjectHeaders) {
4833 // NOTE: We need to deal with possible ObjectMonitor in object header.
4834 // Eventually we might be able to do simple movl & cmpl like in
4835 // the CCP path below.
4836 assert(tmp2 != noreg, "need tmp2");
4837 assert_different_registers(src, dst, tmp1, tmp2);
4838 load_nklass(tmp1, src);
4839 load_nklass(tmp2, dst);
4840 cmpl(tmp1, tmp2);
4841 } else if (UseCompressedClassPointers) {
4842 movl(tmp1, Address(src, oopDesc::klass_offset_in_bytes()));
4843 cmpl(tmp1, Address(dst, oopDesc::klass_offset_in_bytes()));
4844 } else
4845 #endif
4846 {
4847 movptr(tmp1, Address(src, oopDesc::klass_offset_in_bytes()));
4848 cmpptr(tmp1, Address(dst, oopDesc::klass_offset_in_bytes()));
4849 }
4850 }
4851
4852 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
4853 Register tmp1, Register thread_tmp) {
4854 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4855 decorators = AccessInternal::decorator_fixup(decorators);
4856 bool as_raw = (decorators & AS_RAW) != 0;
4857 if (as_raw) {
4858 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4859 } else {
4860 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4861 }
4862 }
4863
4864 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
4865 Register tmp1, Register tmp2) {
4866 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4867 decorators = AccessInternal::decorator_fixup(decorators);
4868 bool as_raw = (decorators & AS_RAW) != 0;
4869 if (as_raw) {
4870 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
4871 } else {
4872 bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
4873 }
4874 }
4875
4876 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4877 Register thread_tmp, DecoratorSet decorators) {
4878 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4879 }
4880
4881 // Doesn't do verification, generates fixed size code
4882 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4883 Register thread_tmp, DecoratorSet decorators) {
4884 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4885 }
4886
4887 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4888 Register tmp2, DecoratorSet decorators) {
4889 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
4890 }
4891
4892 // Used for storing NULLs.
4893 void MacroAssembler::store_heap_oop_null(Address dst) {
4894 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4895 }
4896
4897 #ifdef _LP64
4898 void MacroAssembler::store_klass_gap(Register dst, Register src) {
4899 if (UseCompressedClassPointers) {
4900 // Store to klass gap in destination
4901 movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
4902 }
4903 }
4904
4905 #ifdef ASSERT
4906 void MacroAssembler::verify_heapbase(const char* msg) {
4907 assert (UseCompressedOops, "should be compressed");
4908 assert (Universe::heap() != NULL, "java heap should be initialized");
4909 if (CheckCompressedOops) {
4910 Label ok;
4911 push(rscratch1); // cmpptr trashes rscratch1
4912 cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
4913 jcc(Assembler::equal, ok);
4914 STOP(msg);
4915 bind(ok);
4916 pop(rscratch1);
4917 }
4918 }
4919 #endif
4920
4921 // Algorithm must match oop.inline.hpp encode_heap_oop.
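// Compressed form: narrow = (oop == NULL) ? 0 : (oop - heap_base) >> shift.
// The cmov below maps a NULL oop to the heap base so the subtraction yields 0.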
4922 void MacroAssembler::encode_heap_oop(Register r) {
4923 #ifdef ASSERT
4924 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
4925 #endif
4926 verify_oop_msg(r, "broken oop in encode_heap_oop");
4927 if (CompressedOops::base() == NULL) {
4928 if (CompressedOops::shift() != 0) {
4929 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4930 shrq(r, LogMinObjAlignmentInBytes);
4931 }
4932 return;
4933 }
4934 testq(r, r);
4935 cmovq(Assembler::equal, r, r12_heapbase);
4936 subq(r, r12_heapbase);
4937 shrq(r, LogMinObjAlignmentInBytes);
4938 }
4939
4940 void MacroAssembler::encode_heap_oop_not_null(Register r) {
4941 #ifdef ASSERT
4942 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
4943 if (CheckCompressedOops) {
4944 Label ok;
4945 testq(r, r);
4946 jcc(Assembler::notEqual, ok);
4947 STOP("null oop passed to encode_heap_oop_not_null");
4948 bind(ok);
4949 }
4950 #endif
4951 verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
4952 if (CompressedOops::base() != NULL) {
4953 subq(r, r12_heapbase);
4954 }
4955 if (CompressedOops::shift() != 0) {
4956 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4957 shrq(r, LogMinObjAlignmentInBytes);
4958 }
4959 }
4960
4961 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
4962 #ifdef ASSERT
4963 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
4964 if (CheckCompressedOops) {
4965 Label ok;
4966 testq(src, src);
4967 jcc(Assembler::notEqual, ok);
4968 STOP("null oop passed to encode_heap_oop_not_null2");
4969 bind(ok);
4970 }
4971 #endif
4972 verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
4973 if (dst != src) {
4974 movq(dst, src);
4975 }
4976 if (CompressedOops::base() != NULL) {
4977 subq(dst, r12_heapbase);
4978 }
4979 if (CompressedOops::shift() != 0) {
4980 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4981 shrq(dst, LogMinObjAlignmentInBytes);
4982 }
4983 }
4984
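// Inverse of encode_heap_oop. In pseudocode, a sketch of what the code below
// emits (base/shift as above):
//
//   if (base == NULL) {
//     oop = narrow << shift;                          // zero-based decoding
//   } else {
//     oop = (narrow == 0) ? NULL : base + ((uintptr_t)narrow << shift);
//   }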
4985 void MacroAssembler::decode_heap_oop(Register r) {
4986 #ifdef ASSERT
4987 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
4988 #endif
4989 if (CompressedOops::base() == NULL) {
4990 if (CompressedOops::shift() != 0) {
4991 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4992 shlq(r, LogMinObjAlignmentInBytes);
4993 }
4994 } else {
4995 Label done;
4996 shlq(r, LogMinObjAlignmentInBytes);
4997 jccb(Assembler::equal, done);
4998 addq(r, r12_heapbase);
4999 bind(done);
5000 }
5001 verify_oop_msg(r, "broken oop in decode_heap_oop");
5002 }
5003
5004 void MacroAssembler::decode_heap_oop_not_null(Register r) {
5005 // Note: it will change flags
5006 assert (UseCompressedOops, "should only be used for compressed headers");
5007 assert (Universe::heap() != NULL, "java heap should be initialized");
5008 // Cannot assert, unverified entry point counts instructions (see .ad file)
5009 // vtableStubs also counts instructions in pd_code_size_limit.
5010 // Also do not verify_oop as this is called by verify_oop.
5011 if (CompressedOops::shift() != 0) {
5012 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5013 shlq(r, LogMinObjAlignmentInBytes);
5014 if (CompressedOops::base() != NULL) {
5015 addq(r, r12_heapbase);
5016 }
5017 } else {
5018 assert (CompressedOops::base() == NULL, "sanity");
5019 }
5020 }
5021
5022 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5023 // Note: it will change flags
5024 assert (UseCompressedOops, "should only be used for compressed headers");
5025 assert (Universe::heap() != NULL, "java heap should be initialized");
5026 // Cannot assert, unverified entry point counts instructions (see .ad file)
5027 // vtableStubs also counts instructions in pd_code_size_limit.
5028 // Also do not verify_oop as this is called by verify_oop.
5029 if (CompressedOops::shift() != 0) {
5030 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5031 if (LogMinObjAlignmentInBytes == Address::times_8) {
5032 leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5033 } else {
5034 if (dst != src) {
5035 movq(dst, src);
5036 }
5037 shlq(dst, LogMinObjAlignmentInBytes);
5038 if (CompressedOops::base() != NULL) {
5039 addq(dst, r12_heapbase);
5040 }
5041 }
5042 } else {
5043 assert (CompressedOops::base() == NULL, "sanity");
5044 if (dst != src) {
5045 movq(dst, src);
5046 }
5047 }
5048 }
5049
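// Compressed class pointer encode/decode, in pseudocode (a sketch of what the
// helpers below emit; the klass is known to be non-NULL, and base/shift refer
// to CompressedKlassPointers::base() and CompressedKlassPointers::shift()):
//
//   narrow_klass = (klass - base) >> shift;
//   klass        = base + ((uintptr_t)narrow_klass << shift);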
5050 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
5051 assert_different_registers(r, tmp);
5052 if (CompressedKlassPointers::base() != NULL) {
5053 mov64(tmp, (int64_t)CompressedKlassPointers::base());
5054 subq(r, tmp);
5055 }
5056 if (CompressedKlassPointers::shift() != 0) {
5057 assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5058 shrq(r, LogKlassAlignmentInBytes);
5059 }
5060 }
5061
5062 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
5063 assert_different_registers(src, dst);
5064 if (CompressedKlassPointers::base() != NULL) {
5065 mov64(dst, -(int64_t)CompressedKlassPointers::base());
5066 addq(dst, src);
5067 } else {
5068 movptr(dst, src);
5069 }
5070 if (CompressedKlassPointers::shift() != 0) {
5071 assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5072 shrq(dst, LogKlassAlignmentInBytes);
5073 }
5074 }
5075
5076 // !!! If the instructions that get generated here change, then the function
5077 // instr_size_for_decode_klass_not_null() needs to be updated.
5078 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
5079 assert_different_registers(r, tmp);
5080 // Note: it will change flags
5081 assert(UseCompressedClassPointers, "should only be used for compressed headers");
5082 // Cannot assert, unverified entry point counts instructions (see .ad file)
5083 // vtableStubs also counts instructions in pd_code_size_limit.
5084 // Also do not verify_oop as this is called by verify_oop.
5085 if (CompressedKlassPointers::shift() != 0) {
5086 assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5087 shlq(r, LogKlassAlignmentInBytes);
5088 }
5089 if (CompressedKlassPointers::base() != NULL) {
5090 mov64(tmp, (int64_t)CompressedKlassPointers::base());
5091 addq(r, tmp);
5092 }
5093 }
5094
5095 void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
5096 assert_different_registers(src, dst);
5097 // Note: it will change flags
5098 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5099 // Cannot assert, unverified entry point counts instructions (see .ad file)
5100 // vtableStubs also counts instructions in pd_code_size_limit.
5101 // Also do not verify_oop as this is called by verify_oop.
5102
5103 if (CompressedKlassPointers::base() == NULL &&
5104 CompressedKlassPointers::shift() == 0) {
5105 // The best case scenario is that there is no base or shift. Then it is already
5106 // a pointer that needs nothing but a register rename.
5107 movl(dst, src);
5108 } else {
5109 if (CompressedKlassPointers::base() != NULL) {
5110 mov64(dst, (int64_t)CompressedKlassPointers::base());
5111 } else {
5112 xorq(dst, dst);
5113 }
5114 if (CompressedKlassPointers::shift() != 0) {
5115 assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5116 assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5117 leaq(dst, Address(dst, src, Address::times_8, 0));
5118 } else {
5119 addq(dst, src);
5120 }
5121 }
5122 }
5123
5124 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5125 assert (UseCompressedOops, "should only be used for compressed headers");
5126 assert (Universe::heap() != NULL, "java heap should be initialized");
5127 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5128 int oop_index = oop_recorder()->find_index(obj);
5129 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5130 mov_narrow_oop(dst, oop_index, rspec);
5131 }
5132
5133 void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5134 assert (UseCompressedOops, "should only be used for compressed headers");
5135 assert (Universe::heap() != NULL, "java heap should be initialized");
5136 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5137 int oop_index = oop_recorder()->find_index(obj);
5138 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5139 mov_narrow_oop(dst, oop_index, rspec);
5140 }
5141
5142 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5143 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5144 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5145 int klass_index = oop_recorder()->find_index(k);
5146 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5147 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5148 }
5149
5150 void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5151 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5152 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5153 int klass_index = oop_recorder()->find_index(k);
5154 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5155 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5156 }
5157
5158 void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5159 assert (UseCompressedOops, "should only be used for compressed headers");
5160 assert (Universe::heap() != NULL, "java heap should be initialized");
5161 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5162 int oop_index = oop_recorder()->find_index(obj);
5163 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5164 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5165 }
5166
5167 void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5168 assert (UseCompressedOops, "should only be used for compressed headers");
5169 assert (Universe::heap() != NULL, "java heap should be initialized");
5170 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5171 int oop_index = oop_recorder()->find_index(obj);
5172 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5173 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5174 }
5175
5176 void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5177 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5178 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5179 int klass_index = oop_recorder()->find_index(k);
5180 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5181 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5182 }
5183
5184 void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5185 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5186 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5187 int klass_index = oop_recorder()->find_index(k);
5188 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5189 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5190 }
5191
5192 void MacroAssembler::reinit_heapbase() {
5193 if (UseCompressedOops) {
5194 if (Universe::heap() != NULL) {
5195 if (CompressedOops::base() == NULL) {
5196 MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5197 } else {
5198 mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5199 }
5200 } else {
5201 movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5202 }
5203 }
5204 }
5205
5206 #endif // _LP64
5207
5208 // C2 compiled method's prolog code.
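// In outline, a sketch of the emitted prolog for the common case (stack bang
// needed, PreserveFramePointer off):
//
//   <stack overflow check>
//   push rbp
//   sub  rsp, framesize     // framesize already excludes retaddr and saved rbp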
5209 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
5210
5211 // WARNING: Initial instruction MUST be 5 bytes or longer so that
5212 // NativeJump::patch_verified_entry will be able to patch out the entry
5213 // code safely. The push to verify stack depth is ok at 5 bytes;
5214 // the frame allocation can be either 3 or 6 bytes. So if we don't do a
5215 // stack bang then we must use the 6 byte frame allocation even if
5216 // we have no frame. :-(
5217 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
5218
5219 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
5220 // Remove word for return addr
5221 framesize -= wordSize;
5222 stack_bang_size -= wordSize;
5223
5224 // Calls to C2R adapters often do not accept exceptional returns.
5225 // We require that their callers bang for them. But be careful, because
5226 // some VM calls (such as call site linkage) can use several kilobytes of
5227 // stack. The stack safety zone should account for that, though.
5228 // See bugs 4446381, 4468289, 4497237.
5229 if (stack_bang_size > 0) {
5230 generate_stack_overflow_check(stack_bang_size);
5231
5232 // We always push rbp, so that on return to interpreter rbp, will be
5233 // restored correctly and we can correct the stack.
5234 push(rbp);
5235 // Save caller's stack pointer into RBP if the frame pointer is preserved.
5236 if (PreserveFramePointer) {
5237 mov(rbp, rsp);
5238 }
5239 // Remove word for ebp
5240 framesize -= wordSize;
5241
5242 // Create frame
5243 if (framesize) {
5244 subptr(rsp, framesize);
5245 }
5246 } else {
5247 // Create frame (force generation of a 4 byte immediate value)
5248 subptr_imm32(rsp, framesize);
5249
5250 // Save RBP register now.
5251 framesize -= wordSize;
5252 movptr(Address(rsp, framesize), rbp);
5253 // Save caller's stack pointer into RBP if the frame pointer is preserved.
5254 if (PreserveFramePointer) {
5255 movptr(rbp, rsp);
5256 if (framesize > 0) {
5257 addptr(rbp, framesize);
5258 }
5259 }
5260 }
5261
5262 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
5263 framesize -= wordSize;
5264 movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5265 }
5266
5267 #ifndef _LP64
5268 // If method sets FPU control word do it now
5269 if (fp_mode_24b) {
5270 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
5271 }
5272 if (UseSSE >= 2 && VerifyFPU) {
5273 verify_FPU(0, "FPU stack must be clean on entry");
5274 }
5275 #endif
5276
5277 #ifdef ASSERT
5278 if (VerifyStackAtCalls) {
5279 Label L;
5280 push(rax);
5281 mov(rax, rsp);
5282 andptr(rax, StackAlignmentInBytes-1);
5283 cmpptr(rax, StackAlignmentInBytes-wordSize);
5284 pop(rax);
5285 jcc(Assembler::equal, L);
5286 STOP("Stack is not properly aligned!");
5287 bind(L);
5288 }
5289 #endif
5290
5291 if (!is_stub) {
5292 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5293 bs->nmethod_entry_barrier(this);
5294 }
5295 }
5296
5297 #if COMPILER2_OR_JVMCI
5298
5299 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
5300 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5301 // cnt - number of qwords (8-byte words).
5302 // base - start address, qword aligned.
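// Rough structure, as a sketch of the control flow below: zero a vector
// register, clear 64 bytes per loop iteration, then handle the tail either
// with masked AVX-512 stores or with a 32-byte step plus an 8-byte loop.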
5303 Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5304 bool use64byteVector = MaxVectorSize == 64 && AVX3Threshold == 0;
5305 if (use64byteVector) {
5306 vpxor(xtmp, xtmp, xtmp, AVX_512bit);
5307 } else if (MaxVectorSize >= 32) {
5308 vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5309 } else {
5310 pxor(xtmp, xtmp);
5311 }
5312 jmp(L_zero_64_bytes);
5313
5314 BIND(L_loop);
5315 if (MaxVectorSize >= 32) {
5316 fill64_avx(base, 0, xtmp, use64byteVector);
5317 } else {
5318 movdqu(Address(base, 0), xtmp);
5319 movdqu(Address(base, 16), xtmp);
5320 movdqu(Address(base, 32), xtmp);
5321 movdqu(Address(base, 48), xtmp);
5322 }
5323 addptr(base, 64);
5324
5325 BIND(L_zero_64_bytes);
5326 subptr(cnt, 8);
5327 jccb(Assembler::greaterEqual, L_loop);
5328
5329 // Copy trailing 64 bytes
5330 if (use64byteVector) {
5331 addptr(cnt, 8);
5332 jccb(Assembler::equal, L_end);
5333 fill64_masked_avx(3, base, 0, xtmp, mask, cnt, rtmp, true);
5334 jmp(L_end);
5335 } else {
5336 addptr(cnt, 4);
5337 jccb(Assembler::less, L_tail);
5338 if (MaxVectorSize >= 32) {
5339 vmovdqu(Address(base, 0), xtmp);
5340 } else {
5341 movdqu(Address(base, 0), xtmp);
5342 movdqu(Address(base, 16), xtmp);
5343 }
5344 }
5345 addptr(base, 32);
5346 subptr(cnt, 4);
5347
5348 BIND(L_tail);
5349 addptr(cnt, 4);
5350 jccb(Assembler::lessEqual, L_end);
5351 if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
5352 fill32_masked_avx(3, base, 0, xtmp, mask, cnt, rtmp);
5353 } else {
5354 decrement(cnt);
5355
5356 BIND(L_sloop);
5357 movq(Address(base, 0), xtmp);
5358 addptr(base, 8);
5359 decrement(cnt);
5360 jccb(Assembler::greaterEqual, L_sloop);
5361 }
5362 BIND(L_end);
5363 }
5364
5365 // Clearing constant sized memory using YMM/ZMM registers.
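// As in the variable-sized variant below, cnt is assumed to be a count of
// qwords (8-byte words): full 64-byte blocks are cleared first (unrolled, or
// via a small loop for large counts), and the remaining cnt % 8 qwords are
// handled by the switch below.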
5366 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5367 assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), "");
5368 bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
5369
5370 int vector64_count = (cnt & (~0x7)) >> 3;
5371 cnt = cnt & 0x7;
5372 const int fill64_per_loop = 4;
5373 const int max_unrolled_fill64 = 8;
5374
5375 // 64 byte initialization loop.
5376 vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
5377 int start64 = 0;
5378 if (vector64_count > max_unrolled_fill64) {
5379 Label LOOP;
5380 Register index = rtmp;
5381
5382 start64 = vector64_count - (vector64_count % fill64_per_loop);
5383
5384 movl(index, 0);
5385 BIND(LOOP);
5386 for (int i = 0; i < fill64_per_loop; i++) {
5387 fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
5388 }
5389 addl(index, fill64_per_loop * 64);
5390 cmpl(index, start64 * 64);
5391 jccb(Assembler::less, LOOP);
5392 }
5393 for (int i = start64; i < vector64_count; i++) {
5394 fill64_avx(base, i * 64, xtmp, use64byteVector);
5395 }
5396
5397 // Clear remaining 64 byte tail.
5398 int disp = vector64_count * 64;
5399 if (cnt) {
5400 switch (cnt) {
5401 case 1:
5402 movq(Address(base, disp), xtmp);
5403 break;
5404 case 2:
5405 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_128bit);
5406 break;
5407 case 3:
5408 movl(rtmp, 0x7);
5409 kmovwl(mask, rtmp);
5410 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_256bit);
5411 break;
5412 case 4:
5413 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5414 break;
5415 case 5:
5416 if (use64byteVector) {
5417 movl(rtmp, 0x1F);
5418 kmovwl(mask, rtmp);
5419 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5420 } else {
5421 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5422 movq(Address(base, disp + 32), xtmp);
5423 }
5424 break;
5425 case 6:
5426 if (use64byteVector) {
5427 movl(rtmp, 0x3F);
5428 kmovwl(mask, rtmp);
5429 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5430 } else {
5431 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5432 evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, Assembler::AVX_128bit);
5433 }
5434 break;
5435 case 7:
5436 if (use64byteVector) {
5437 movl(rtmp, 0x7F);
5438 kmovwl(mask, rtmp);
5439 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5440 } else {
5441 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5442 movl(rtmp, 0x7);
5443 kmovwl(mask, rtmp);
5444 evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, Assembler::AVX_256bit);
5445 }
5446 break;
5447 default:
5448 fatal("Unexpected length: %d\n", cnt);
5449 break;
5450 }
5451 }
5452 }
5453
5454 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
5455 bool is_large, KRegister mask) {
5456 // cnt - number of qwords (8-byte words).
5457 // base - start address, qword aligned.
5458 // is_large - if optimizers know cnt is larger than InitArrayShortSize
5459 assert(base==rdi, "base register must be edi for rep stos");
5460 assert(tmp==rax, "tmp register must be eax for rep stos");
5461 assert(cnt==rcx, "cnt register must be ecx for rep stos");
5462 assert(InitArrayShortSize % BytesPerLong == 0,
5463 "InitArrayShortSize should be the multiple of BytesPerLong");
5464
5465 Label DONE;
5466 if (!is_large || !UseXMMForObjInit) {
5467 xorptr(tmp, tmp);
5468 }
5469
5470 if (!is_large) {
5471 Label LOOP, LONG;
5472 cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5473 jccb(Assembler::greater, LONG);
5474
5475 NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5476
5477 decrement(cnt);
5478 jccb(Assembler::negative, DONE); // Zero length
5479
5480 // Use individual pointer-sized stores for small counts:
5481 BIND(LOOP);
5482 movptr(Address(base, cnt, Address::times_ptr), tmp);
5483 decrement(cnt);
5484 jccb(Assembler::greaterEqual, LOOP);
5485 jmpb(DONE);
5486
5487 BIND(LONG);
5488 }
5489
5490 // Use longer rep-prefixed ops for non-small counts:
5491 if (UseFastStosb) {
5492 shlptr(cnt, 3); // convert to number of bytes
5493 rep_stosb();
5494 } else if (UseXMMForObjInit) {
5495 xmm_clear_mem(base, cnt, tmp, xtmp, mask);
5496 } else {
5497 NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5498 rep_stos();
5499 }
5500
5501 BIND(DONE);
5502 }
5503
5504 #endif //COMPILER2_OR_JVMCI
5505
5506
5507 void MacroAssembler::generate_fill(BasicType t, bool aligned,
5508 Register to, Register value, Register count,
5509 Register rtmp, XMMRegister xtmp) {
5510 ShortBranchVerifier sbv(this);
5511 assert_different_registers(to, value, count, rtmp);
5512 Label L_exit;
5513 Label L_fill_2_bytes, L_fill_4_bytes;
5514
5515 int shift = -1;
5516 switch (t) {
5517 case T_BYTE:
5518 shift = 2;
5519 break;
5520 case T_SHORT:
5521 shift = 1;
5522 break;
5523 case T_INT:
5524 shift = 0;
5525 break;
5526 default: ShouldNotReachHere();
5527 }
5528
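// Replicate the fill value across a 32-bit word so the wider stores below can
// be used: for T_BYTE, value = b | b<<8 | b<<16 | b<<24; for T_SHORT,
// value = s | s<<16.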
5529 if (t == T_BYTE) {
5530 andl(value, 0xff);
5531 movl(rtmp, value);
5532 shll(rtmp, 8);
5533 orl(value, rtmp);
5534 }
5535 if (t == T_SHORT) {
5536 andl(value, 0xffff);
5537 }
5538 if (t == T_BYTE || t == T_SHORT) {
5539 movl(rtmp, value);
5540 shll(rtmp, 16);
5541 orl(value, rtmp);
5542 }
5543
5544 cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
5545 jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
5546 if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
5547 Label L_skip_align2;
5548 // align source address at 4 bytes address boundary
5549 if (t == T_BYTE) {
5550 Label L_skip_align1;
5551 // One byte misalignment happens only for byte arrays
5552 testptr(to, 1);
5553 jccb(Assembler::zero, L_skip_align1);
5554 movb(Address(to, 0), value);
5555 increment(to);
5556 decrement(count);
5557 BIND(L_skip_align1);
5558 }
5559 // Two bytes misalignment happens only for byte and short (char) arrays
5560 testptr(to, 2);
5561 jccb(Assembler::zero, L_skip_align2);
5562 movw(Address(to, 0), value);
5563 addptr(to, 2);
5564 subl(count, 1<<(shift-1));
5565 BIND(L_skip_align2);
5566 }
5567 if (UseSSE < 2) {
5568 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5569 // Fill 32-byte chunks
5570 subl(count, 8 << shift);
5571 jcc(Assembler::less, L_check_fill_8_bytes);
5572 align(16);
5573
5574 BIND(L_fill_32_bytes_loop);
5575
5576 for (int i = 0; i < 32; i += 4) {
5577 movl(Address(to, i), value);
5578 }
5579
5580 addptr(to, 32);
5581 subl(count, 8 << shift);
5582 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5583 BIND(L_check_fill_8_bytes);
5584 addl(count, 8 << shift);
5585 jccb(Assembler::zero, L_exit);
5586 jmpb(L_fill_8_bytes);
5587
5588 //
5589 // length is too short, just fill qwords
5590 //
5591 BIND(L_fill_8_bytes_loop);
5592 movl(Address(to, 0), value);
5593 movl(Address(to, 4), value);
5594 addptr(to, 8);
5595 BIND(L_fill_8_bytes);
5596 subl(count, 1 << (shift + 1));
5597 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5598 // fall through to fill 4 bytes
5599 } else {
5600 Label L_fill_32_bytes;
5601 if (!UseUnalignedLoadStores) {
5602 // align to 8 bytes, we know we are 4 byte aligned to start
5603 testptr(to, 4);
5604 jccb(Assembler::zero, L_fill_32_bytes);
5605 movl(Address(to, 0), value);
5606 addptr(to, 4);
5607 subl(count, 1<<shift);
5608 }
5609 BIND(L_fill_32_bytes);
5610 {
5611 assert( UseSSE >= 2, "supported cpu only" );
5612 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5613 movdl(xtmp, value);
5614 if (UseAVX >= 2 && UseUnalignedLoadStores) {
5615 Label L_check_fill_32_bytes;
5616 if (UseAVX > 2) {
5617 // Fill 64-byte chunks
5618 Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
5619
5620 // If number of bytes to fill < AVX3Threshold, perform fill using AVX2
5621 cmpl(count, AVX3Threshold);
5622 jccb(Assembler::below, L_check_fill_64_bytes_avx2);
5623
5624 vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
5625
5626 subl(count, 16 << shift);
5627 jccb(Assembler::less, L_check_fill_32_bytes);
5628 align(16);
5629
5630 BIND(L_fill_64_bytes_loop_avx3);
5631 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
5632 addptr(to, 64);
5633 subl(count, 16 << shift);
5634 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
5635 jmpb(L_check_fill_32_bytes);
5636
5637 BIND(L_check_fill_64_bytes_avx2);
5638 }
5639 // Fill 64-byte chunks
5640 Label L_fill_64_bytes_loop;
5641 vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
5642
5643 subl(count, 16 << shift);
5644 jcc(Assembler::less, L_check_fill_32_bytes);
5645 align(16);
5646
5647 BIND(L_fill_64_bytes_loop);
5648 vmovdqu(Address(to, 0), xtmp);
5649 vmovdqu(Address(to, 32), xtmp);
5650 addptr(to, 64);
5651 subl(count, 16 << shift);
5652 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
5653
5654 BIND(L_check_fill_32_bytes);
5655 addl(count, 8 << shift);
5656 jccb(Assembler::less, L_check_fill_8_bytes);
5657 vmovdqu(Address(to, 0), xtmp);
5658 addptr(to, 32);
5659 subl(count, 8 << shift);
5660
5661 BIND(L_check_fill_8_bytes);
5662 // clean upper bits of YMM registers
5663 movdl(xtmp, value);
5664 pshufd(xtmp, xtmp, 0);
5665 } else {
5666 // Fill 32-byte chunks
5667 pshufd(xtmp, xtmp, 0);
5668
5669 subl(count, 8 << shift);
5670 jcc(Assembler::less, L_check_fill_8_bytes);
5671 align(16);
5672
5673 BIND(L_fill_32_bytes_loop);
5674
5675 if (UseUnalignedLoadStores) {
5676 movdqu(Address(to, 0), xtmp);
5677 movdqu(Address(to, 16), xtmp);
5678 } else {
5679 movq(Address(to, 0), xtmp);
5680 movq(Address(to, 8), xtmp);
5681 movq(Address(to, 16), xtmp);
5682 movq(Address(to, 24), xtmp);
5683 }
5684
5685 addptr(to, 32);
5686 subl(count, 8 << shift);
5687 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5688
5689 BIND(L_check_fill_8_bytes);
5690 }
5691 addl(count, 8 << shift);
5692 jccb(Assembler::zero, L_exit);
5693 jmpb(L_fill_8_bytes);
5694
5695 //
5696 // length is too short, just fill qwords
5697 //
5698 BIND(L_fill_8_bytes_loop);
5699 movq(Address(to, 0), xtmp);
5700 addptr(to, 8);
5701 BIND(L_fill_8_bytes);
5702 subl(count, 1 << (shift + 1));
5703 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5704 }
5705 }
5706 // fill trailing 4 bytes
5707 BIND(L_fill_4_bytes);
5708 testl(count, 1<<shift);
5709 jccb(Assembler::zero, L_fill_2_bytes);
5710 movl(Address(to, 0), value);
5711 if (t == T_BYTE || t == T_SHORT) {
5712 Label L_fill_byte;
5713 addptr(to, 4);
5714 BIND(L_fill_2_bytes);
5715 // fill trailing 2 bytes
5716 testl(count, 1<<(shift-1));
5717 jccb(Assembler::zero, L_fill_byte);
5718 movw(Address(to, 0), value);
5719 if (t == T_BYTE) {
5720 addptr(to, 2);
5721 BIND(L_fill_byte);
5722 // fill trailing byte
5723 testl(count, 1);
5724 jccb(Assembler::zero, L_exit);
5725 movb(Address(to, 0), value);
5726 } else {
5727 BIND(L_fill_byte);
5728 }
5729 } else {
5730 BIND(L_fill_2_bytes);
5731 }
5732 BIND(L_exit);
5733 }
5734
5735 // encode char[] to byte[] in ISO_8859_1 or ASCII
5736 // @IntrinsicCandidate
5737 // private static int implEncodeISOArray(byte[] sa, int sp,
5738 //                                       byte[] da, int dp, int len) {
5739 //   int i = 0;
5740 //   for (; i < len; i++) {
5741 //     char c = StringUTF16.getChar(sa, sp++);
5742 //     if (c > '\u00FF')
5743 //       break;
5744 //     da[dp++] = (byte)c;
5745 //   }
5746 //   return i;
5747 // }
5748 //
5749 // @IntrinsicCandidate
5750 // private static int implEncodeAsciiArray(char[] sa, int sp,
5751 //                                         byte[] da, int dp, int len) {
5752 //   int i = 0;
5753 //   for (; i < len; i++) {
5754 //     char c = sa[sp++];
5755 //     if (c >= '\u0080')
5756 //       break;
5757 //     da[dp++] = (byte)c;
5758 //   }
5759 //   return i;
5760 // }
5761 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
5762 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
5763 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
5764 Register tmp5, Register result, bool ascii) {
5765
5766 // rsi: src
5767 // rdi: dst
5768 // rdx: len
5769 // rcx: tmp5
5770 // rax: result
5771 ShortBranchVerifier sbv(this);
5772 assert_different_registers(src, dst, len, tmp5, result);
5773 Label L_done, L_copy_1_char, L_copy_1_char_exit;
5774
5775 int mask = ascii ? 0xff80ff80 : 0xff00ff00;
5776 int short_mask = ascii ? 0xff80 : 0xff00;
5777
5778 // set result
5779 xorl(result, result);
5780 // check for zero length
5781 testl(len, len);
5782 jcc(Assembler::zero, L_done);
5783
5784 movl(result, len);
5785
5786 // Setup pointers
5787 lea(src, Address(src, len, Address::times_2)); // char[]
5788 lea(dst, Address(dst, len, Address::times_1)); // byte[]
5789 negptr(len);
5790
5791 if (UseSSE42Intrinsics || UseAVX >= 2) {
5792 Label L_copy_8_chars, L_copy_8_chars_exit;
5793 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
5794
5795 if (UseAVX >= 2) {
5796 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
5797 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
5798 movdl(tmp1Reg, tmp5);
5799 vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
5800 jmp(L_chars_32_check);
5801
5802 bind(L_copy_32_chars);
5803 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
5804 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
5805 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5806 vptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector
5807 jccb(Assembler::notZero, L_copy_32_chars_exit);
5808 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5809 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
5810 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
5811
5812 bind(L_chars_32_check);
5813 addptr(len, 32);
5814 jcc(Assembler::lessEqual, L_copy_32_chars);
5815
5816 bind(L_copy_32_chars_exit);
5817 subptr(len, 16);
5818 jccb(Assembler::greater, L_copy_16_chars_exit);
5819
5820 } else if (UseSSE42Intrinsics) {
5821 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
5822 movdl(tmp1Reg, tmp5);
5823 pshufd(tmp1Reg, tmp1Reg, 0);
5824 jmpb(L_chars_16_check);
5825 }
5826
5827 bind(L_copy_16_chars);
5828 if (UseAVX >= 2) {
5829 vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
5830 vptest(tmp2Reg, tmp1Reg);
5831 jcc(Assembler::notZero, L_copy_16_chars_exit);
5832 vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
5833 vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
5834 } else {
5835 if (UseAVX > 0) {
5836 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5837 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5838 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
5839 } else {
5840 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5841 por(tmp2Reg, tmp3Reg);
5842 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5843 por(tmp2Reg, tmp4Reg);
5844 }
5845 ptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector
5846 jccb(Assembler::notZero, L_copy_16_chars_exit);
5847 packuswb(tmp3Reg, tmp4Reg);
5848 }
5849 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
5850
5851 bind(L_chars_16_check);
5852 addptr(len, 16);
5853 jcc(Assembler::lessEqual, L_copy_16_chars);
5854
5855 bind(L_copy_16_chars_exit);
5856 if (UseAVX >= 2) {
5857 // clean upper bits of YMM registers
5858 vpxor(tmp2Reg, tmp2Reg);
5859 vpxor(tmp3Reg, tmp3Reg);
5860 vpxor(tmp4Reg, tmp4Reg);
5861 movdl(tmp1Reg, tmp5);
5862 pshufd(tmp1Reg, tmp1Reg, 0);
5863 }
5864 subptr(len, 8);
5865 jccb(Assembler::greater, L_copy_8_chars_exit);
5866
5867 bind(L_copy_8_chars);
5868 movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
5869 ptest(tmp3Reg, tmp1Reg);
5870 jccb(Assembler::notZero, L_copy_8_chars_exit);
5871 packuswb(tmp3Reg, tmp1Reg);
5872 movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
5873 addptr(len, 8);
5874 jccb(Assembler::lessEqual, L_copy_8_chars);
5875
5876 bind(L_copy_8_chars_exit);
5877 subptr(len, 8);
5878 jccb(Assembler::zero, L_done);
5879 }
5880
5881 bind(L_copy_1_char);
5882 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
5883 testl(tmp5, short_mask); // check if Unicode or non-ASCII char
5884 jccb(Assembler::notZero, L_copy_1_char_exit);
5885 movb(Address(dst, len, Address::times_1, 0), tmp5);
5886 addptr(len, 1);
5887 jccb(Assembler::less, L_copy_1_char);
5888
5889 bind(L_copy_1_char_exit);
5890 addptr(result, len); // len is the negative count of unprocessed elements
5891
5892 bind(L_done);
5893 }
5894
5895 #ifdef _LP64
5896 /**
5897 * Helper for multiply_to_len(): 128-bit add, dest_hi:dest_lo += src1 + src2.
5898 */
5899 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
5900 addq(dest_lo, src1);
5901 adcq(dest_hi, 0);
5902 addq(dest_lo, src2);
5903 adcq(dest_hi, 0);
5904 }
5905
5906 /**
5907 * Multiply 64 bit by 64 bit first loop.
5908 */
5909 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5910 Register y, Register y_idx, Register z,
5911 Register carry, Register product,
5912 Register idx, Register kdx) {
5913 //
5914 // jlong carry, x[], y[], z[];
5915 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5916 // huge_128 product = y[idx] * x[xstart] + carry;
5917 // z[kdx] = (jlong)product;
5918 // carry = (jlong)(product >>> 64);
5919 // }
5920 // z[xstart] = carry;
5921 //
5922
5923 Label L_first_loop, L_first_loop_exit;
5924 Label L_one_x, L_one_y, L_multiply;
5925
5926 decrementl(xstart);
5927 jcc(Assembler::negative, L_one_x);
5928
5929 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
5930 rorq(x_xstart, 32); // convert big-endian to little-endian
5931
5932 bind(L_first_loop);
5933 decrementl(idx);
5934 jcc(Assembler::negative, L_first_loop_exit);
5935 decrementl(idx);
5936 jcc(Assembler::negative, L_one_y);
5937 movq(y_idx, Address(y, idx, Address::times_4, 0));
5938 rorq(y_idx, 32); // convert big-endian to little-endian
5939 bind(L_multiply);
5940 movq(product, x_xstart);
5941 mulq(y_idx); // product(rax) * y_idx -> rdx:rax
5942 addq(product, carry);
5943 adcq(rdx, 0);
5944 subl(kdx, 2);
5945 movl(Address(z, kdx, Address::times_4, 4), product);
5946 shrq(product, 32);
5947 movl(Address(z, kdx, Address::times_4, 0), product);
5948 movq(carry, rdx);
5949 jmp(L_first_loop);
5950
5951 bind(L_one_y);
5952 movl(y_idx, Address(y, 0));
5953 jmp(L_multiply);
5954
5955 bind(L_one_x);
5956 movl(x_xstart, Address(x, 0));
5957 jmp(L_first_loop);
5958
5959 bind(L_first_loop_exit);
5960 }
5961
5962 /**
5963 * Multiply 64 bit by 64 bit and add 128 bit.
5964 */
5965 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
5966 Register yz_idx, Register idx,
5967 Register carry, Register product, int offset) {
5968 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
5969 // z[kdx] = (jlong)product;
5970
5971 movq(yz_idx, Address(y, idx, Address::times_4, offset));
5972 rorq(yz_idx, 32); // convert big-endian to little-endian
5973 movq(product, x_xstart);
5974 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
5975 movq(yz_idx, Address(z, idx, Address::times_4, offset));
5976 rorq(yz_idx, 32); // convert big-endian to little-endian
5977
5978 add2_with_carry(rdx, product, carry, yz_idx);
5979
5980 movl(Address(z, idx, Address::times_4, offset+4), product);
5981 shrq(product, 32);
5982 movl(Address(z, idx, Address::times_4, offset), product);
5983
5984 }
5985
5986 /**
5987 * Multiply 128 bit by 128 bit. Unrolled inner loop.
5988 */
5989 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
5990 Register yz_idx, Register idx, Register jdx,
5991 Register carry, Register product,
5992 Register carry2) {
5993 // jlong carry, x[], y[], z[];
5994 // int kdx = ystart+1;
5995 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5996 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
5997 // z[kdx+idx+1] = (jlong)product;
5998 // jlong carry2 = (jlong)(product >>> 64);
5999 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
6000 // z[kdx+idx] = (jlong)product;
6001 // carry = (jlong)(product >>> 64);
6002 // }
6003 // idx += 2;
6004 // if (idx > 0) {
6005 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
6006 // z[kdx+idx] = (jlong)product;
6007 // carry = (jlong)(product >>> 64);
6008 // }
6009 //
6010
6011 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6012
6013 movl(jdx, idx);
6014 andl(jdx, 0xFFFFFFFC);
6015 shrl(jdx, 2);
6016
6017 bind(L_third_loop);
6018 subl(jdx, 1);
6019 jcc(Assembler::negative, L_third_loop_exit);
6020 subl(idx, 4);
6021
6022 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
6023 movq(carry2, rdx);
6024
6025 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
6026 movq(carry, rdx);
6027 jmp(L_third_loop);
6028
6029 bind (L_third_loop_exit);
6030
6031 andl (idx, 0x3);
6032 jcc(Assembler::zero, L_post_third_loop_done);
6033
6034 Label L_check_1;
6035 subl(idx, 2);
6036 jcc(Assembler::negative, L_check_1);
6037
6038 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
6039 movq(carry, rdx);
6040
6041 bind (L_check_1);
6042 addl (idx, 0x2);
6043 andl (idx, 0x1);
6044 subl(idx, 1);
6045 jcc(Assembler::negative, L_post_third_loop_done);
6046
6047 movl(yz_idx, Address(y, idx, Address::times_4, 0));
6048 movq(product, x_xstart);
6049 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
6050 movl(yz_idx, Address(z, idx, Address::times_4, 0));
6051
6052 add2_with_carry(rdx, product, yz_idx, carry);
6053
6054 movl(Address(z, idx, Address::times_4, 0), product);
6055 shrq(product, 32);
6056
6057 shlq(rdx, 32);
6058 orq(product, rdx);
6059 movq(carry, product);
6060
6061 bind(L_post_third_loop_done);
6062 }
6063
6064 /**
6065 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
6066 *
6067 */
6068 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
6069 Register carry, Register carry2,
6070 Register idx, Register jdx,
6071 Register yz_idx1, Register yz_idx2,
6072 Register tmp, Register tmp3, Register tmp4) {
6073 assert(UseBMI2Instructions, "should be used only when BMI2 is available");
6074
6075 // jlong carry, x[], y[], z[];
6076 // int kdx = ystart+1;
6077 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6078 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
6079 // jlong carry2 = (jlong)(tmp3 >>> 64);
6080 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
6081 // carry = (jlong)(tmp4 >>> 64);
6082 // z[kdx+idx+1] = (jlong)tmp3;
6083 // z[kdx+idx] = (jlong)tmp4;
6084 // }
6085 // idx += 2;
6086 // if (idx > 0) {
6087 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
6088 // z[kdx+idx] = (jlong)yz_idx1;
6089 // carry = (jlong)(yz_idx1 >>> 64);
6090 // }
6091 //
6092
6093 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6094
6095 movl(jdx, idx);
6096 andl(jdx, 0xFFFFFFFC);
6097 shrl(jdx, 2);
6098
6099 bind(L_third_loop);
6100 subl(jdx, 1);
6101 jcc(Assembler::negative, L_third_loop_exit);
6102 subl(idx, 4);
6103
6104 movq(yz_idx1, Address(y, idx, Address::times_4, 8));
6105 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
6106 movq(yz_idx2, Address(y, idx, Address::times_4, 0));
6107 rorxq(yz_idx2, yz_idx2, 32);
6108
6109 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
6110 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp
6111
6112 movq(yz_idx1, Address(z, idx, Address::times_4, 8));
6113 rorxq(yz_idx1, yz_idx1, 32);
6114 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
6115 rorxq(yz_idx2, yz_idx2, 32);
6116
6117 if (VM_Version::supports_adx()) {
6118 adcxq(tmp3, carry);
6119 adoxq(tmp3, yz_idx1);
6120
6121 adcxq(tmp4, tmp);
6122 adoxq(tmp4, yz_idx2);
6123
6124 movl(carry, 0); // does not affect flags
6125 adcxq(carry2, carry);
6126 adoxq(carry2, carry);
6127 } else {
6128 add2_with_carry(tmp4, tmp3, carry, yz_idx1);
6129 add2_with_carry(carry2, tmp4, tmp, yz_idx2);
6130 }
6131 movq(carry, carry2);
6132
6133 movl(Address(z, idx, Address::times_4, 12), tmp3);
6134 shrq(tmp3, 32);
6135 movl(Address(z, idx, Address::times_4, 8), tmp3);
6136
6137 movl(Address(z, idx, Address::times_4, 4), tmp4);
6138 shrq(tmp4, 32);
6139 movl(Address(z, idx, Address::times_4, 0), tmp4);
6140
6141 jmp(L_third_loop);
6142
6143 bind (L_third_loop_exit);
6144
6145 andl (idx, 0x3);
6146 jcc(Assembler::zero, L_post_third_loop_done);
6147
6148 Label L_check_1;
6149 subl(idx, 2);
6150 jcc(Assembler::negative, L_check_1);
6151
6152 movq(yz_idx1, Address(y, idx, Address::times_4, 0));
6153 rorxq(yz_idx1, yz_idx1, 32);
6154 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
6155 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
6156 rorxq(yz_idx2, yz_idx2, 32);
6157
6158 add2_with_carry(tmp4, tmp3, carry, yz_idx2);
6159
6160 movl(Address(z, idx, Address::times_4, 4), tmp3);
6161 shrq(tmp3, 32);
6162 movl(Address(z, idx, Address::times_4, 0), tmp3);
6163 movq(carry, tmp4);
6164
6165 bind (L_check_1);
6166 addl (idx, 0x2);
6167 andl (idx, 0x1);
6168 subl(idx, 1);
6169 jcc(Assembler::negative, L_post_third_loop_done);
6170 movl(tmp4, Address(y, idx, Address::times_4, 0));
6171 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
6172 movl(tmp4, Address(z, idx, Address::times_4, 0));
6173
6174 add2_with_carry(carry2, tmp3, tmp4, carry);
6175
6176 movl(Address(z, idx, Address::times_4, 0), tmp3);
6177 shrq(tmp3, 32);
6178
6179 shlq(carry2, 32);
6180 orq(tmp3, carry2);
6181 movq(carry, tmp3);
6182
6183 bind(L_post_third_loop_done);
6184 }
6185
6186 /**
6187 * Code for BigInteger::multiplyToLen() intrinsic.
6188 *
6189 * rdi: x
6190 * rax: xlen
6191 * rsi: y
6192 * rcx: ylen
6193 * r8: z
6194 * r11: zlen
6195 * r12: tmp1
6196 * r13: tmp2
6197 * r14: tmp3
6198 * r15: tmp4
6199 * rbx: tmp5
6200 *
6201 */
6202 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
6203 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
6204 ShortBranchVerifier sbv(this);
6205 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
6206
6207 push(tmp1);
6208 push(tmp2);
6209 push(tmp3);
6210 push(tmp4);
6211 push(tmp5);
6212
6213 push(xlen);
6214 push(zlen);
6215
6216 const Register idx = tmp1;
6217 const Register kdx = tmp2;
6218 const Register xstart = tmp3;
6219
6220 const Register y_idx = tmp4;
6221 const Register carry = tmp5;
6222 const Register product = xlen;
6223 const Register x_xstart = zlen; // reuse register
6224
6225 // First Loop.
6226 //
6227 // final static long LONG_MASK = 0xffffffffL;
6228 // int xstart = xlen - 1;
6229 // int ystart = ylen - 1;
6230 // long carry = 0;
6231 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
6232 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
6233 // z[kdx] = (int)product;
6234 // carry = product >>> 32;
6235 // }
6236 // z[xstart] = (int)carry;
6237 //
6238
6239 movl(idx, ylen); // idx = ylen;
6240 movl(kdx, zlen); // kdx = xlen+ylen;
6241 xorq(carry, carry); // carry = 0;
6242
6243 Label L_done;
6244
6245 movl(xstart, xlen);
6246 decrementl(xstart);
6247 jcc(Assembler::negative, L_done);
6248
6249 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
6250
6251 Label L_second_loop;
6252 testl(kdx, kdx);
6253 jcc(Assembler::zero, L_second_loop);
6254
6255 Label L_carry;
6256 subl(kdx, 1);
6257 jcc(Assembler::zero, L_carry);
6258
6259 movl(Address(z, kdx, Address::times_4, 0), carry);
6260 shrq(carry, 32);
6261 subl(kdx, 1);
6262
6263 bind(L_carry);
6264 movl(Address(z, kdx, Address::times_4, 0), carry);
6265
6266 // Second and third (nested) loops.
6267 //
6268 // for (int i = xstart-1; i >= 0; i--) { // Second loop
6269 // carry = 0;
6270 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
6271 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
6272 // (z[k] & LONG_MASK) + carry;
6273 // z[k] = (int)product;
6274 // carry = product >>> 32;
6275 // }
6276 // z[i] = (int)carry;
6277 // }
6278 //
6279 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
6280
6281 const Register jdx = tmp1;
6282
6283 bind(L_second_loop);
6284 xorl(carry, carry); // carry = 0;
6285 movl(jdx, ylen); // j = ystart+1
6286
6287 subl(xstart, 1); // i = xstart-1;
6288 jcc(Assembler::negative, L_done);
6289
6290 push (z);
6291
6292 Label L_last_x;
6293 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
6294 subl(xstart, 1); // i = xstart-1;
6295 jcc(Assembler::negative, L_last_x);
6296
6297 if (UseBMI2Instructions) {
6298 movq(rdx, Address(x, xstart, Address::times_4, 0));
6299 rorxq(rdx, rdx, 32); // convert big-endian to little-endian
6300 } else {
6301 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
6302 rorq(x_xstart, 32); // convert big-endian to little-endian
6303 }
6304
6305 Label L_third_loop_prologue;
6306 bind(L_third_loop_prologue);
6307
6308 push (x);
6309 push (xstart);
6310 push (ylen);
6311
6312
6313 if (UseBMI2Instructions) {
6314 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
6315 } else { // !UseBMI2Instructions
6316 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
6317 }
6318
6319 pop(ylen);
6320 pop(xlen);
6321 pop(x);
6322 pop(z);
6323
6324 movl(tmp3, xlen);
6325 addl(tmp3, 1);
6326 movl(Address(z, tmp3, Address::times_4, 0), carry);
6327 subl(tmp3, 1);
6328 jccb(Assembler::negative, L_done);
6329
6330 shrq(carry, 32);
6331 movl(Address(z, tmp3, Address::times_4, 0), carry);
6332 jmp(L_second_loop);
6333
6334 // Next infrequent code is moved outside loops.
6335 bind(L_last_x);
6336 if (UseBMI2Instructions) {
6337 movl(rdx, Address(x, 0));
6338 } else {
6339 movl(x_xstart, Address(x, 0));
6340 }
6341 jmp(L_third_loop_prologue);
6342
6343 bind(L_done);
6344
6345 pop(zlen);
6346 pop(xlen);
6347
6348 pop(tmp5);
6349 pop(tmp4);
6350 pop(tmp3);
6351 pop(tmp2);
6352 pop(tmp1);
6353 }
6354
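// Compares two array ranges and reports where they first differ (a sketch of
// the contract as implemented below): 'length' is an element count, scaled to
// a byte count via log2_array_indxscale (in rcx); on exit, 'result' holds the
// element index of the first mismatch, or -1 if the ranges are equal.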
6355 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
6356 Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
6357 assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
6358 Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
6359 Label VECTOR8_TAIL, VECTOR4_TAIL;
6360 Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
6361 Label SAME_TILL_END, DONE;
6362 Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
6363
6364 // scale is in rcx in both Win64 and Unix
6365 ShortBranchVerifier sbv(this);
6366
6367 shlq(length);
6368 xorq(result, result);
6369
6370 if ((AVX3Threshold == 0) && (UseAVX > 2) &&
6371 VM_Version::supports_avx512vlbw()) {
6372 Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
6373
6374 cmpq(length, 64);
6375 jcc(Assembler::less, VECTOR32_TAIL);
6376
6377 movq(tmp1, length);
6378 andq(tmp1, 0x3F); // tail count
6379 andq(length, ~(0x3F)); //vector count
6380
6381 bind(VECTOR64_LOOP);
6382 // AVX512 code to compare 64 byte vectors.
6383 evmovdqub(rymm0, Address(obja, result), false, Assembler::AVX_512bit);
6384 evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
6385 kortestql(k7, k7);
6386 jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch
6387 addq(result, 64);
6388 subq(length, 64);
6389 jccb(Assembler::notZero, VECTOR64_LOOP);
6390
6391 //bind(VECTOR64_TAIL);
6392 testq(tmp1, tmp1);
6393 jcc(Assembler::zero, SAME_TILL_END);
6394
6396 // AVX512 code to compare up to 63 byte vectors.
6397 mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
6398 shlxq(tmp2, tmp2, tmp1);
6399 notq(tmp2);
6400 kmovql(k3, tmp2);
6401
6402 evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
6403 evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
6404
6405 ktestql(k7, k3);
6406 jcc(Assembler::below, SAME_TILL_END); // not mismatch
6407
6408 bind(VECTOR64_NOT_EQUAL);
6409 kmovql(tmp1, k7);
6410 notq(tmp1);
6411 tzcntq(tmp1, tmp1);
6412 addq(result, tmp1);
6413 shrq(result);
6414 jmp(DONE);
6415 bind(VECTOR32_TAIL);
6416 }
6417
6418 cmpq(length, 8);
6419 jcc(Assembler::equal, VECTOR8_LOOP);
6420 jcc(Assembler::less, VECTOR4_TAIL);
6421
6422 if (UseAVX >= 2) {
6423 Label VECTOR16_TAIL, VECTOR32_LOOP;
6424
6425 cmpq(length, 16);
6426 jcc(Assembler::equal, VECTOR16_LOOP);
6427 jcc(Assembler::less, VECTOR8_LOOP);
6428
6429 cmpq(length, 32);
6430 jccb(Assembler::less, VECTOR16_TAIL);
6431
6432 subq(length, 32);
6433 bind(VECTOR32_LOOP);
6434 vmovdqu(rymm0, Address(obja, result));
6435 vmovdqu(rymm1, Address(objb, result));
6436 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
6437 vptest(rymm2, rymm2);
6438 jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
6439 addq(result, 32);
6440 subq(length, 32);
6441 jcc(Assembler::greaterEqual, VECTOR32_LOOP);
6442 addq(length, 32);
6443 jcc(Assembler::equal, SAME_TILL_END);
6444 // falling through if less than 32 bytes left; close the branch here.
6445
6446 bind(VECTOR16_TAIL);
6447 cmpq(length, 16);
6448 jccb(Assembler::less, VECTOR8_TAIL);
6449 bind(VECTOR16_LOOP);
6450 movdqu(rymm0, Address(obja, result));
6451 movdqu(rymm1, Address(objb, result));
6452 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
6453 ptest(rymm2, rymm2);
6454 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6455 addq(result, 16);
6456 subq(length, 16);
6457 jcc(Assembler::equal, SAME_TILL_END);
6458 //falling through if less than 16 bytes left
6459 } else {//regular intrinsics
6460
6461 cmpq(length, 16);
6462 jccb(Assembler::less, VECTOR8_TAIL);
6463
6464 subq(length, 16);
6465 bind(VECTOR16_LOOP);
6466 movdqu(rymm0, Address(obja, result));
6467 movdqu(rymm1, Address(objb, result));
6468 pxor(rymm0, rymm1);
6469 ptest(rymm0, rymm0);
6470 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6471 addq(result, 16);
6472 subq(length, 16);
6473 jccb(Assembler::greaterEqual, VECTOR16_LOOP);
6474 addq(length, 16);
6475 jcc(Assembler::equal, SAME_TILL_END);
6476 //falling through if less than 16 bytes left
6477 }
6478
6479 bind(VECTOR8_TAIL);
6480 cmpq(length, 8);
6481 jccb(Assembler::less, VECTOR4_TAIL);
6482 bind(VECTOR8_LOOP);
6483 movq(tmp1, Address(obja, result));
6484 movq(tmp2, Address(objb, result));
6485 xorq(tmp1, tmp2);
6486 testq(tmp1, tmp1);
6487 jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
6488 addq(result, 8);
6489 subq(length, 8);
6490 jcc(Assembler::equal, SAME_TILL_END);
6491 //falling through if less than 8 bytes left
6492
6493 bind(VECTOR4_TAIL);
6494 cmpq(length, 4);
6495 jccb(Assembler::less, BYTES_TAIL);
6496 bind(VECTOR4_LOOP);
6497 movl(tmp1, Address(obja, result));
6498 xorl(tmp1, Address(objb, result));
6499 testl(tmp1, tmp1);
6500 jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
6501 addq(result, 4);
6502 subq(length, 4);
6503 jcc(Assembler::equal, SAME_TILL_END);
6504 //falling through if less than 4 bytes left
6505
6506 bind(BYTES_TAIL);
6507 bind(BYTES_LOOP);
6508 load_unsigned_byte(tmp1, Address(obja, result));
6509 load_unsigned_byte(tmp2, Address(objb, result));
6510 xorl(tmp1, tmp2);
6511 testl(tmp1, tmp1);
6512 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6513 decq(length);
6514 jcc(Assembler::zero, SAME_TILL_END);
6515 incq(result);
6516 load_unsigned_byte(tmp1, Address(obja, result));
6517 load_unsigned_byte(tmp2, Address(objb, result));
6518 xorl(tmp1, tmp2);
6519 testl(tmp1, tmp1);
6520 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6521 decq(length);
6522 jcc(Assembler::zero, SAME_TILL_END);
6523 incq(result);
6524 load_unsigned_byte(tmp1, Address(obja, result));
6525 load_unsigned_byte(tmp2, Address(objb, result));
6526 xorl(tmp1, tmp2);
6527 testl(tmp1, tmp1);
6528 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6529 jmp(SAME_TILL_END);
6530
6531 if (UseAVX >= 2) {
6532 bind(VECTOR32_NOT_EQUAL);
6533 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
6534 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
6535 vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
6536 vpmovmskb(tmp1, rymm0);
6537 bsfq(tmp1, tmp1);
6538 addq(result, tmp1);
6539 shrq(result);
6540 jmp(DONE);
6541 }
6542
6543 bind(VECTOR16_NOT_EQUAL);
6544 if (UseAVX >= 2) {
6545 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
6546 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
6547 pxor(rymm0, rymm2);
6548 } else {
6549 pcmpeqb(rymm2, rymm2);
6550 pxor(rymm0, rymm1);
6551 pcmpeqb(rymm0, rymm1);
6552 pxor(rymm0, rymm2);
6553 }
6554 pmovmskb(tmp1, rymm0);
6555 bsfq(tmp1, tmp1);
6556 addq(result, tmp1);
6557 shrq(result);
6558 jmpb(DONE);
6559
6560 bind(VECTOR8_NOT_EQUAL);
6561 bind(VECTOR4_NOT_EQUAL);
6562 bsfq(tmp1, tmp1);
6563 shrq(tmp1, 3);
6564 addq(result, tmp1);
6565 bind(BYTES_NOT_EQUAL);
6566 shrq(result);
6567 jmpb(DONE);
6568
6569 bind(SAME_TILL_END);
6570 mov64(result, -1);
6571
6572 bind(DONE);
6573 }
6574
6575 //Helper functions for square_to_len()
6576
6577 /**
6578 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
6579 * Preserves x and z and modifies rest of the registers.
6580 */
6581 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6582 // Perform square and right shift by 1
6583 // Handle odd xlen case first, then for even xlen do the following
6584 // jlong carry = 0;
6585 // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
6586 // huge_128 product = x[j:j+1] * x[j:j+1];
6587 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
6588 // z[i+2:i+3] = (jlong)(product >>> 1);
6589 // carry = (jlong)product;
6590 // }
6591
6592 xorq(tmp5, tmp5); // carry
6593 xorq(rdxReg, rdxReg);
6594 xorl(tmp1, tmp1); // index for x
6595 xorl(tmp4, tmp4); // index for z
6596
6597 Label L_first_loop, L_first_loop_exit;
6598
6599 testl(xlen, 1);
6600 jccb(Assembler::zero, L_first_loop); //jump if xlen is even
6601
6602 // Square and right shift by 1 the odd element using 32 bit multiply
6603 movl(raxReg, Address(x, tmp1, Address::times_4, 0));
6604 imulq(raxReg, raxReg);
6605 shrq(raxReg, 1);
6606 adcq(tmp5, 0);
6607 movq(Address(z, tmp4, Address::times_4, 0), raxReg);
6608 incrementl(tmp1);
6609 addl(tmp4, 2);
6610
6611 // Square and right shift by 1 the rest using 64 bit multiply
6612 bind(L_first_loop);
6613 cmpptr(tmp1, xlen);
6614 jccb(Assembler::equal, L_first_loop_exit);
6615
6616 // Square
6617 movq(raxReg, Address(x, tmp1, Address::times_4, 0));
6618 rorq(raxReg, 32); // convert big-endian to little-endian
6619 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax
6620
6621 // Right shift by 1 and save carry
6622 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
6623 rcrq(rdxReg, 1);
6624 rcrq(raxReg, 1);
6625 adcq(tmp5, 0);
6626
6627 // Store result in z
6628 movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
6629 movq(Address(z, tmp4, Address::times_4, 8), raxReg);
6630
6631 // Update indices for x and z
6632 addl(tmp1, 2);
6633 addl(tmp4, 4);
6634 jmp(L_first_loop);
6635
6636 bind(L_first_loop_exit);
6637 }
6638
6639
6640 /**
6641 * Perform the following multiply add operation using BMI2 instructions
6642 * carry:sum = sum + op1*op2 + carry
6643 * op2 should be in rdx
6644 * op2 is preserved, all other registers are modified
6645 */
6646 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
6647 // assert op2 is rdx
6648 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1
6649 addq(sum, carry);
6650 adcq(tmp2, 0);
6651 addq(sum, op1);
6652 adcq(tmp2, 0);
6653 movq(carry, tmp2);
6654 }
6655
6656 /**
6657 * Perform the following multiply add operation:
6658 * carry:sum = sum + op1*op2 + carry
6659 * Preserves op1, op2 and modifies rest of registers
6660 */
6661 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
6662 // rdx:rax = op1 * op2
6663 movq(raxReg, op2);
6664 mulq(op1);
6665
6666 // rdx:rax = sum + carry + rdx:rax
6667 addq(sum, carry);
6668 adcq(rdxReg, 0);
6669 addq(sum, raxReg);
6670 adcq(rdxReg, 0);
6671
6672 // carry:sum = rdx:sum
6673 movq(carry, rdxReg);
6674 }
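
// A reference sketch (not compiled here) of what both multiply_add_64 variants above
// compute, assuming an unsigned 128-bit integer type is available:
//
//   static inline void multiply_add_64_ref(uint64_t* sum, uint64_t op1, uint64_t op2,
//                                          uint64_t* carry) {
//     // carry:sum = sum + op1 * op2 + carry
//     unsigned __int128 acc = (unsigned __int128)op1 * op2;
//     acc += *sum;
//     acc += *carry;
//     *sum   = (uint64_t)acc;          // low 64 bits
//     *carry = (uint64_t)(acc >> 64);  // high 64 bits
//   }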
6675
6676 /**
6677 * Add a 64-bit (jlong) carry into z[] with carry propagation.
6678 * Preserves the z and carry register values and modifies the rest of the registers.
6679 *
6680 */
6681 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
6682 Label L_fourth_loop, L_fourth_loop_exit;
6683
6684 movl(tmp1, 1);
6685 subl(zlen, 2);
6686 addq(Address(z, zlen, Address::times_4, 0), carry);
6687
6688 bind(L_fourth_loop);
6689 jccb(Assembler::carryClear, L_fourth_loop_exit);
6690 subl(zlen, 2);
6691 jccb(Assembler::negative, L_fourth_loop_exit);
6692 addq(Address(z, zlen, Address::times_4, 0), tmp1);
6693 jmp(L_fourth_loop);
6694 bind(L_fourth_loop_exit);
6695 }
6696
6697 /**
6698 * Shift z[] left by 1 bit.
6699 * Preserves x, len, z and zlen registers and modifies rest of the registers.
6700 *
6701 */
6702 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
6703
6704 Label L_fifth_loop, L_fifth_loop_exit;
6705
6706 // Fifth loop
6707 // Perform primitiveLeftShift(z, zlen, 1)
6708
6709 const Register prev_carry = tmp1;
6710 const Register new_carry = tmp4;
6711 const Register value = tmp2;
6712 const Register zidx = tmp3;
6713
6714 // int zidx, carry;
6715 // long value;
6716 // carry = 0;
6717 // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
6718 // (carry:value) = (z[i] << 1) | carry ;
6719 // z[i] = value;
6720 // }
6721
6722 movl(zidx, zlen);
6723 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
6724
6725 bind(L_fifth_loop);
6726 decl(zidx); // Use decl to preserve carry flag
6727 decl(zidx);
6728 jccb(Assembler::negative, L_fifth_loop_exit);
6729
6730 if (UseBMI2Instructions) {
6731 movq(value, Address(z, zidx, Address::times_4, 0));
6732 rclq(value, 1);
6733 rorxq(value, value, 32);
6734 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
6735 }
6736 else {
6737 // clear new_carry
6738 xorl(new_carry, new_carry);
6739
6740 // Shift z[i] by 1, or in previous carry and save new carry
6741 movq(value, Address(z, zidx, Address::times_4, 0));
6742 shlq(value, 1);
6743 adcl(new_carry, 0);
6744
6745 orq(value, prev_carry);
6746 rorq(value, 0x20);
6747 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
6748
6749 // Set previous carry = new carry
6750 movl(prev_carry, new_carry);
6751 }
6752 jmp(L_fifth_loop);
6753
6754 bind(L_fifth_loop_exit);
6755 }
6756
6757
6758 /**
6759 * Code for BigInteger::squareToLen() intrinsic
6760 *
6761 * rdi: x
6762 * rsi: len
6763 * r8: z
6764 * rcx: zlen
6765 * r12: tmp1
6766 * r13: tmp2
6767 * r14: tmp3
6768 * r15: tmp4
6769 * rbx: tmp5
6770 *
6771 */
6772 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6773
6774 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
6775 push(tmp1);
6776 push(tmp2);
6777 push(tmp3);
6778 push(tmp4);
6779 push(tmp5);
6780
6781 // First loop
6782 // Store the squares, right shifted one bit (i.e., divided by 2).
6783 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
6784
6785 // Add in off-diagonal sums.
6786 //
6787 // Second, third (nested) and fourth loops.
6788 // zlen +=2;
6789 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
6790 // carry = 0;
6791 // long op2 = x[xidx:xidx+1];
6792 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
6793 // k -= 2;
6794 // long op1 = x[j:j+1];
6795 // long sum = z[k:k+1];
6796 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
6797 // z[k:k+1] = sum;
6798 // }
6799 // add_one_64(z, k, carry, tmp_regs);
6800 // }
6801
6802 const Register carry = tmp5;
6803 const Register sum = tmp3;
6804 const Register op1 = tmp4;
6805 Register op2 = tmp2;
6806
6807 push(zlen);
6808 push(len);
6809 addl(zlen,2);
6810 bind(L_second_loop);
6811 xorq(carry, carry);
6812 subl(zlen, 4);
6813 subl(len, 2);
6814 push(zlen);
6815 push(len);
6816 cmpl(len, 0);
6817 jccb(Assembler::lessEqual, L_second_loop_exit);
6818
6819 // Multiply an array by one 64 bit long.
6820 if (UseBMI2Instructions) {
6821 op2 = rdxReg;
6822 movq(op2, Address(x, len, Address::times_4, 0));
6823 rorxq(op2, op2, 32);
6824 }
6825 else {
6826 movq(op2, Address(x, len, Address::times_4, 0));
6827 rorq(op2, 32);
6828 }
6829
6830 bind(L_third_loop);
6831 decrementl(len);
6832 jccb(Assembler::negative, L_third_loop_exit);
6833 decrementl(len);
6834 jccb(Assembler::negative, L_last_x);
6835
6836 movq(op1, Address(x, len, Address::times_4, 0));
6837 rorq(op1, 32);
6838
6839 bind(L_multiply);
6840 subl(zlen, 2);
6841 movq(sum, Address(z, zlen, Address::times_4, 0));
6842
6843 // Multiply 64 bit by 64 bit, add the lower 64 bits into sum, and keep the upper 64 bits as the carry.
6844 if (UseBMI2Instructions) {
6845 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
6846 }
6847 else {
6848 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6849 }
6850
6851 movq(Address(z, zlen, Address::times_4, 0), sum);
6852
6853 jmp(L_third_loop);
6854 bind(L_third_loop_exit);
6855
6856 // Fourth loop
6857 // Add the 64-bit carry into z with carry propagation.
6858 // Uses the already-adjusted (offset) zlen.
6859 add_one_64(z, zlen, carry, tmp1);
6860
6861 pop(len);
6862 pop(zlen);
6863 jmp(L_second_loop);
6864
6865 // The following infrequently executed code is placed outside the loops.
6866 bind(L_last_x);
6867 movl(op1, Address(x, 0));
6868 jmp(L_multiply);
6869
6870 bind(L_second_loop_exit);
6871 pop(len);
6872 pop(zlen);
6873 pop(len);
6874 pop(zlen);
6875
6876 // Fifth loop
6877 // Shift z left 1 bit.
6878 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
6879
6880 // z[zlen-1] |= x[len-1] & 1;
6881 movl(tmp3, Address(x, len, Address::times_4, -4));
6882 andl(tmp3, 1);
6883 orl(Address(z, zlen, Address::times_4, -4), tmp3);
6884
6885 pop(tmp5);
6886 pop(tmp4);
6887 pop(tmp3);
6888 pop(tmp2);
6889 pop(tmp1);
6890 }
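
// High-level reference sketch (Java-like pseudocode paraphrasing the comments above;
// the helper names are illustrative only, not the exact library source) of the overall
// square_to_len computation:
//
//   squareToLen(x, len, z, zlen) {
//     storeSquaresRightShifted(x, len, z);       // first loop: diagonal terms of x*x, >>> 1
//     addOffDiagonalProducts(x, len, z, zlen);   // second/third/fourth loops
//     primitiveLeftShift(z, zlen, 1);            // fifth loop: z <<= 1
//     z[zlen-1] |= x[len-1] & 1;                 // restore the lowest bit
//   }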
6891
6892 /**
6893 * Helper function for mul_add()
6894 * Multiply in[] by the int k and add to out[] starting at offset offs, using a
6895 * 128-bit by 32-bit multiply, and return the carry in tmp5.
6896 * Only a quad-int-aligned number of in[] elements is processed by this function.
6897 * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
6898 * This function preserves the out, in and k registers.
6899 * len and offset point to the appropriate index in "in" and "out" respectively.
6900 * tmp5 holds the carry.
6901 * The other registers are temporaries and are modified.
6902 *
6903 */
6904 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
6905 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
6906 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6907
6908 Label L_first_loop, L_first_loop_exit;
6909
6910 movl(tmp1, len);
6911 shrl(tmp1, 2);
6912
6913 bind(L_first_loop);
6914 subl(tmp1, 1);
6915 jccb(Assembler::negative, L_first_loop_exit);
6916
6917 subl(len, 4);
6918 subl(offset, 4);
6919
6920 Register op2 = tmp2;
6921 const Register sum = tmp3;
6922 const Register op1 = tmp4;
6923 const Register carry = tmp5;
6924
6925 if (UseBMI2Instructions) {
6926 op2 = rdxReg;
6927 }
6928
6929 movq(op1, Address(in, len, Address::times_4, 8));
6930 rorq(op1, 32);
6931 movq(sum, Address(out, offset, Address::times_4, 8));
6932 rorq(sum, 32);
6933 if (UseBMI2Instructions) {
6934 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6935 }
6936 else {
6937 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6938 }
6939 // Store back in big endian from little endian
6940 rorq(sum, 0x20);
6941 movq(Address(out, offset, Address::times_4, 8), sum);
6942
6943 movq(op1, Address(in, len, Address::times_4, 0));
6944 rorq(op1, 32);
6945 movq(sum, Address(out, offset, Address::times_4, 0));
6946 rorq(sum, 32);
6947 if (UseBMI2Instructions) {
6948 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6949 }
6950 else {
6951 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6952 }
6953 // Store back in big endian from little endian
6954 rorq(sum, 0x20);
6955 movq(Address(out, offset, Address::times_4, 0), sum);
6956
6957 jmp(L_first_loop);
6958 bind(L_first_loop_exit);
6959 }
6960
6961 /**
6962 * Code for BigInteger::mulAdd() intrinsic
6963 *
6964 * rdi: out
6965 * rsi: in
6966 * r11: offs (out.length - offset)
6967 * rcx: len
6968 * r8: k
6969 * r12: tmp1
6970 * r13: tmp2
6971 * r14: tmp3
6972 * r15: tmp4
6973 * rbx: tmp5
6974 * Multiply the in[] by word k and add to out[], return the carry in rax
6975 */
6976 void MacroAssembler::mul_add(Register out, Register in, Register offs,
6977 Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
6978 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6979
6980 Label L_carry, L_last_in, L_done;
6981
6982 // carry = 0;
6983 // for (int j=len-1; j >= 0; j--) {
6984 // long product = (in[j] & LONG_MASK) * kLong +
6985 // (out[offs] & LONG_MASK) + carry;
6986 // out[offs--] = (int)product;
6987 // carry = product >>> 32;
6988 // }
6989 //
6990 push(tmp1);
6991 push(tmp2);
6992 push(tmp3);
6993 push(tmp4);
6994 push(tmp5);
6995
6996 Register op2 = tmp2;
6997 const Register sum = tmp3;
6998 const Register op1 = tmp4;
6999 const Register carry = tmp5;
7000
7001 if (UseBMI2Instructions) {
7002 op2 = rdxReg;
7003 movl(op2, k);
7004 }
7005 else {
7006 movl(op2, k);
7007 }
7008
7009 xorq(carry, carry);
7010
7011 // First loop
7012
7013 // Multiply in[] by k in a 4-way unrolled loop using a 128-bit by 32-bit multiply
7014 // The carry is in tmp5
7015 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
7016
7017 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
7018 decrementl(len);
7019 jccb(Assembler::negative, L_carry);
7020 decrementl(len);
7021 jccb(Assembler::negative, L_last_in);
7022
7023 movq(op1, Address(in, len, Address::times_4, 0));
7024 rorq(op1, 32);
7025
7026 subl(offs, 2);
7027 movq(sum, Address(out, offs, Address::times_4, 0));
7028 rorq(sum, 32);
7029
7030 if (UseBMI2Instructions) {
7031 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7032 }
7033 else {
7034 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7035 }
7036
7037 // Store back in big endian from little endian
7038 rorq(sum, 0x20);
7039 movq(Address(out, offs, Address::times_4, 0), sum);
7040
7041 testl(len, len);
7042 jccb(Assembler::zero, L_carry);
7043
7044 //Multiply the last in[] entry, if any
7045 bind(L_last_in);
7046 movl(op1, Address(in, 0));
7047 movl(sum, Address(out, offs, Address::times_4, -4));
7048
7049 movl(raxReg, k);
7050 mull(op1); //tmp4 * eax -> edx:eax
7051 addl(sum, carry);
7052 adcl(rdxReg, 0);
7053 addl(sum, raxReg);
7054 adcl(rdxReg, 0);
7055 movl(carry, rdxReg);
7056
7057 movl(Address(out, offs, Address::times_4, -4), sum);
7058
7059 bind(L_carry);
7060 //return tmp5/carry as carry in rax
7061 movl(rax, carry);
7062
7063 bind(L_done);
7064 pop(tmp5);
7065 pop(tmp4);
7066 pop(tmp3);
7067 pop(tmp2);
7068 pop(tmp1);
7069 }
7070 #endif
7071
7072 /**
7073 * Emits code to update CRC-32 with a byte value according to constants in table
7074 *
7075 * @param [in,out] crc   Register containing the crc.
7076 * @param [in]     val   Register containing the byte to fold into the CRC.
7077 * @param [in]     table Register containing the table of crc constants.
7078 *
7079 * uint32_t crc;
7080 * val = crc_table[(val ^ crc) & 0xFF];
7081 * crc = val ^ (crc >> 8);
7082 *
7083 */
7084 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7085 xorl(val, crc);
7086 andl(val, 0xFF);
7087 shrl(crc, 8); // unsigned shift
7088 xorl(crc, Address(table, val, Address::times_4, 0));
7089 }
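
// A reference sketch (not compiled here) of the table-driven byte update emitted above,
// assuming `table` points at a standard 256-entry reflected CRC-32 lookup table:
//
//   static inline uint32_t update_byte_crc32_ref(uint32_t crc, uint8_t val,
//                                                const uint32_t* table) {
//     return table[(val ^ crc) & 0xFF] ^ (crc >> 8);
//   }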
7090
7091 /**
7092 * Fold 128-bit data chunk
7093 */
7094 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7095 if (UseAVX > 0) {
7096 vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7097 vpclmulldq(xcrc, xK, xcrc); // [63:0]
7098 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7099 pxor(xcrc, xtmp);
7100 } else {
7101 movdqa(xtmp, xcrc);
7102 pclmulhdq(xtmp, xK); // [123:64]
7103 pclmulldq(xcrc, xK); // [63:0]
7104 pxor(xcrc, xtmp);
7105 movdqu(xtmp, Address(buf, offset));
7106 pxor(xcrc, xtmp);
7107 }
7108 }
7109
7110 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7111 if (UseAVX > 0) {
7112 vpclmulhdq(xtmp, xK, xcrc);
7113 vpclmulldq(xcrc, xK, xcrc);
7114 pxor(xcrc, xbuf);
7115 pxor(xcrc, xtmp);
7116 } else {
7117 movdqa(xtmp, xcrc);
7118 pclmulhdq(xtmp, xK);
7119 pclmulldq(xcrc, xK);
7120 pxor(xcrc, xbuf);
7121 pxor(xcrc, xtmp);
7122 }
7123 }
7124
7125 /**
7126 * 8-bit folds to compute 32-bit CRC
7127 *
7128 * uint64_t xcrc;
7129 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7130 */
7131 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7132 movdl(tmp, xcrc);
7133 andl(tmp, 0xFF);
7134 movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7135 psrldq(xcrc, 1); // unsigned shift one byte
7136 pxor(xcrc, xtmp);
7137 }
7138
7139 /**
7140 * uint32_t crc;
7141 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7142 */
7143 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7144 movl(tmp, crc);
7145 andl(tmp, 0xFF);
7146 shrl(crc, 8);
7147 xorl(crc, Address(table, tmp, Address::times_4, 0));
7148 }
7149
7150 /**
7151 * @param crc register containing existing CRC (32-bit)
7152 * @param buf register pointing to input byte buffer (byte*)
7153 * @param len register containing number of bytes
7154 * @param table register that will contain address of CRC table
7155 * @param tmp scratch register
7156 */
7157 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7158 assert_different_registers(crc, buf, len, table, tmp, rax);
7159
7160 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7161 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7162
7163 // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
7164 // context for the registers used, where all instructions below use 128-bit mode.
7165 // On EVEX without VL and BW, these instructions will all be AVX.
7166 lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7167 notl(crc); // ~crc
7168 cmpl(len, 16);
7169 jcc(Assembler::less, L_tail);
7170
7171 // Align buffer to 16 bytes
7172 movl(tmp, buf);
7173 andl(tmp, 0xF);
7174 jccb(Assembler::zero, L_aligned);
7175 subl(tmp, 16);
7176 addl(len, tmp);
7177
7178 align(4);
7179 BIND(L_align_loop);
7180 movsbl(rax, Address(buf, 0)); // load byte with sign extension
7181 update_byte_crc32(crc, rax, table);
7182 increment(buf);
7183 incrementl(tmp);
7184 jccb(Assembler::less, L_align_loop);
7185
7186 BIND(L_aligned);
7187 movl(tmp, len); // save
7188 shrl(len, 4);
7189 jcc(Assembler::zero, L_tail_restore);
7190
7191 // Fold crc into first bytes of vector
7192 movdqa(xmm1, Address(buf, 0));
7193 movdl(rax, xmm1);
7194 xorl(crc, rax);
7195 if (VM_Version::supports_sse4_1()) {
7196 pinsrd(xmm1, crc, 0);
7197 } else {
7198 pinsrw(xmm1, crc, 0);
7199 shrl(crc, 16);
7200 pinsrw(xmm1, crc, 1);
7201 }
7202 addptr(buf, 16);
7203 subl(len, 4); // len > 0
7204 jcc(Assembler::less, L_fold_tail);
7205
7206 movdqa(xmm2, Address(buf, 0));
7207 movdqa(xmm3, Address(buf, 16));
7208 movdqa(xmm4, Address(buf, 32));
7209 addptr(buf, 48);
7210 subl(len, 3);
7211 jcc(Assembler::lessEqual, L_fold_512b);
7212
7213 // Fold total 512 bits of polynomial on each iteration,
7214 // 128 bits per each of 4 parallel streams.
7215 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
7216
7217 align32();
7218 BIND(L_fold_512b_loop);
7219 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
7220 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
7221 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
7222 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
7223 addptr(buf, 64);
7224 subl(len, 4);
7225 jcc(Assembler::greater, L_fold_512b_loop);
7226
7227 // Fold 512 bits to 128 bits.
7228 BIND(L_fold_512b);
7229 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7230 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
7231 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7232 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7233
7234 // Fold the rest of 128 bits data chunks
7235 BIND(L_fold_tail);
7236 addl(len, 3);
7237 jccb(Assembler::lessEqual, L_fold_128b);
7238 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7239
7240 BIND(L_fold_tail_loop);
7241 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
7242 addptr(buf, 16);
7243 decrementl(len);
7244 jccb(Assembler::greater, L_fold_tail_loop);
7245
7246 // Fold 128 bits in xmm1 down into 32 bits in crc register.
7247 BIND(L_fold_128b);
7248 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7249 if (UseAVX > 0) {
7250 vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7251 vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7252 vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7253 } else {
7254 movdqa(xmm2, xmm0);
7255 pclmulqdq(xmm2, xmm1, 0x1);
7256 movdqa(xmm3, xmm0);
7257 pand(xmm3, xmm2);
7258 pclmulqdq(xmm0, xmm3, 0x1);
7259 }
7260 psrldq(xmm1, 8);
7261 psrldq(xmm2, 4);
7262 pxor(xmm0, xmm1);
7263 pxor(xmm0, xmm2);
7264
7265 // 8 8-bit folds to compute 32-bit CRC.
7266 for (int j = 0; j < 4; j++) {
7267 fold_8bit_crc32(xmm0, table, xmm1, rax);
7268 }
7269 movdl(crc, xmm0); // mov 32 bits to general register
7270 for (int j = 0; j < 4; j++) {
7271 fold_8bit_crc32(crc, table, rax);
7272 }
7273
7274 BIND(L_tail_restore);
7275 movl(len, tmp); // restore
7276 BIND(L_tail);
7277 andl(len, 0xf);
7278 jccb(Assembler::zero, L_exit);
7279
7280 // Fold the rest of bytes
7281 align(4);
7282 BIND(L_tail_loop);
7283 movsbl(rax, Address(buf, 0)); // load byte with sign extension
7284 update_byte_crc32(crc, rax, table);
7285 increment(buf);
7286 decrementl(len);
7287 jccb(Assembler::greater, L_tail_loop);
7288
7289 BIND(L_exit);
7290 notl(crc); // ~crc
7291 }
7292
7293 #ifdef _LP64
7294 // Helper function for AVX 512 CRC32
7295 // Fold 512-bit data chunks
7296 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
7297 Register pos, int offset) {
7298 evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
7299 evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
7300 evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
7301 evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
7302 evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
7303 }
7304
7305 // Helper function for AVX 512 CRC32
7306 // Compute CRC32 for < 256B buffers
7307 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
7308 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7309 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7310
7311 Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
7312 Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
7313 Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
7314
7315 // check if there is enough buffer to be able to fold 16B at a time
7316 cmpl(len, 32);
7317 jcc(Assembler::less, L_less_than_32);
7318
7319 // if there is, load the constants
7320 movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10
7321 movdl(xmm0, crc); // get the initial crc value
7322 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7323 pxor(xmm7, xmm0);
7324
7325 // update the buffer pointer
7326 addl(pos, 16);
7327 // update the counter; subtract 32 instead of 16 to save one instruction in the loop
7328 subl(len, 32);
7329 jmp(L_16B_reduction_loop);
7330
7331 bind(L_less_than_32);
7332 // move the initial crc to the return value; this is necessary for zero-length buffers
7333 movl(rax, crc);
7334 testl(len, len);
7335 jcc(Assembler::equal, L_cleanup);
7336
7337 movdl(xmm0, crc); //get the initial crc value
7338
7339 cmpl(len, 16);
7340 jcc(Assembler::equal, L_exact_16_left);
7341 jcc(Assembler::less, L_less_than_16_left);
7342
7343 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7344 pxor(xmm7, xmm0); //xor the initial crc value
7345 addl(pos, 16);
7346 subl(len, 16);
7347 movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10
7348 jmp(L_get_last_two_xmms);
7349
7350 bind(L_less_than_16_left);
7351 // use stack space to load data less than 16 bytes; zero out the 16B in memory first
7352 pxor(xmm1, xmm1);
7353 movptr(tmp1, rsp);
7354 movdqu(Address(tmp1, 0 * 16), xmm1);
7355
7356 cmpl(len, 4);
7357 jcc(Assembler::less, L_only_less_than_4);
7358
7359 //backup the counter value
7360 movl(tmp2, len);
7361 cmpl(len, 8);
7362 jcc(Assembler::less, L_less_than_8_left);
7363
7364 //load 8 Bytes
7365 movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
7366 movq(Address(tmp1, 0 * 16), rax);
7367 addptr(tmp1, 8);
7368 subl(len, 8);
7369 addl(pos, 8);
7370
7371 bind(L_less_than_8_left);
7372 cmpl(len, 4);
7373 jcc(Assembler::less, L_less_than_4_left);
7374
7375 //load 4 Bytes
7376 movl(rax, Address(buf, pos, Address::times_1, 0));
7377 movl(Address(tmp1, 0 * 16), rax);
7378 addptr(tmp1, 4);
7379 subl(len, 4);
7380 addl(pos, 4);
7381
7382 bind(L_less_than_4_left);
7383 cmpl(len, 2);
7384 jcc(Assembler::less, L_less_than_2_left);
7385
7386 // load 2 Bytes
7387 movw(rax, Address(buf, pos, Address::times_1, 0));
7388 movl(Address(tmp1, 0 * 16), rax);
7389 addptr(tmp1, 2);
7390 subl(len, 2);
7391 addl(pos, 2);
7392
7393 bind(L_less_than_2_left);
7394 cmpl(len, 1);
7395 jcc(Assembler::less, L_zero_left);
7396
7397 // load 1 Byte
7398 movb(rax, Address(buf, pos, Address::times_1, 0));
7399 movb(Address(tmp1, 0 * 16), rax);
7400
7401 bind(L_zero_left);
7402 movdqu(xmm7, Address(rsp, 0));
7403 pxor(xmm7, xmm0); //xor the initial crc value
7404
7405 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7406 movdqu(xmm0, Address(rax, tmp2));
7407 pshufb(xmm7, xmm0);
7408 jmp(L_128_done);
7409
7410 bind(L_exact_16_left);
7411 movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
7412 pxor(xmm7, xmm0); //xor the initial crc value
7413 jmp(L_128_done);
7414
7415 bind(L_only_less_than_4);
7416 cmpl(len, 3);
7417 jcc(Assembler::less, L_only_less_than_3);
7418
7419 // load 3 Bytes
7420 movb(rax, Address(buf, pos, Address::times_1, 0));
7421 movb(Address(tmp1, 0), rax);
7422
7423 movb(rax, Address(buf, pos, Address::times_1, 1));
7424 movb(Address(tmp1, 1), rax);
7425
7426 movb(rax, Address(buf, pos, Address::times_1, 2));
7427 movb(Address(tmp1, 2), rax);
7428
7429 movdqu(xmm7, Address(rsp, 0));
7430 pxor(xmm7, xmm0); //xor the initial crc value
7431
7432 pslldq(xmm7, 0x5);
7433 jmp(L_barrett);
7434 bind(L_only_less_than_3);
7435 cmpl(len, 2);
7436 jcc(Assembler::less, L_only_less_than_2);
7437
7438 // load 2 Bytes
7439 movb(rax, Address(buf, pos, Address::times_1, 0));
7440 movb(Address(tmp1, 0), rax);
7441
7442 movb(rax, Address(buf, pos, Address::times_1, 1));
7443 movb(Address(tmp1, 1), rax);
7444
7445 movdqu(xmm7, Address(rsp, 0));
7446 pxor(xmm7, xmm0); //xor the initial crc value
7447
7448 pslldq(xmm7, 0x6);
7449 jmp(L_barrett);
7450
7451 bind(L_only_less_than_2);
7452 //load 1 Byte
7453 movb(rax, Address(buf, pos, Address::times_1, 0));
7454 movb(Address(tmp1, 0), rax);
7455
7456 movdqu(xmm7, Address(rsp, 0));
7457 pxor(xmm7, xmm0); //xor the initial crc value
7458
7459 pslldq(xmm7, 0x7);
7460 }
7461
7462 /**
7463 * Compute CRC32 using AVX512 instructions
7464 * @param crc   register containing existing CRC (32-bit)
7465 * @param buf   register pointing to input byte buffer (byte*)
7466 * @param len   register containing number of bytes
7467 * @param table address of crc or crc32c table
7468 * @param tmp1  scratch register
7469 * @param tmp2  scratch register
7470 * @return rax  result register
7471 *
7472 * This routine is identical for crc32c with the exception of the precomputed constant
7473 * table which will be passed as the table argument. The calculation steps are
7474 * the same for both variants.
7475 */
7476 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
7477 assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
7478
7479 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7480 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7481 Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
7482 Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
7483 Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
7484
7485 const Register pos = r12;
7486 push(r12);
7487 subptr(rsp, 16 * 2 + 8);
7488
7489 // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
7490 // context for the registers used, where all instructions below use 128-bit mode.
7491 // On EVEX without VL and BW, these instructions will all be AVX.
7492 movl(pos, 0);
7493
7494 // check if smaller than 256B
7495 cmpl(len, 256);
7496 jcc(Assembler::less, L_less_than_256);
7497
7498 // load the initial crc value
7499 movdl(xmm10, crc);
7500
7501 // receive the initial 64B data, xor the initial crc value
7502 evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
7503 evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
7504 evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
7505 evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
7506
7507 subl(len, 256);
7508 cmpl(len, 256);
7509 jcc(Assembler::less, L_fold_128_B_loop);
7510
7511 evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
7512 evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
7513 evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
7514 subl(len, 256);
7515
7516 bind(L_fold_256_B_loop);
7517 addl(pos, 256);
7518 fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
7519 fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
7520 fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
7521 fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
7522
7523 subl(len, 256);
7524 jcc(Assembler::greaterEqual, L_fold_256_B_loop);
7525
7526 // Fold 256 into 128
7527 addl(pos, 256);
7528 evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
7529 evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
7530 vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
7531
7532 evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
7533 evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
7534 vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
7535
7536 evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
7537 evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
7538
7539 addl(len, 128);
7540 jmp(L_fold_128_B_register);
7541
7542 // At this point in the code there are 128 * x + y (0 <= y < 128) bytes of buffer. The fold_128_B_loop
7543 // will fold 128B at a time until we have 128 + y bytes of buffer left.
7544
7545 // Fold 128B at a time. This section of the code folds two zmm registers (8 x 128 bits) in parallel.
7546 bind(L_fold_128_B_loop);
7547 addl(pos, 128);
7548 fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
7549 fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
7550
7551 subl(len, 128);
7552 jcc(Assembler::greaterEqual, L_fold_128_B_loop);
7553
7554 addl(pos, 128);
7555
7556 // At this point the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128.
7557 // The 128B of folded data is held in zmm0 and zmm4 (referred to below as xmm0 and xmm4).
7558 bind(L_fold_128_B_register);
7559 evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
7560 evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7561 evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
7562 evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
7563 // save last that has no multiplicand
7564 vextracti64x2(xmm7, xmm4, 3);
7565
7566 evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
7567 evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
7568 // Needed later in reduction loop
7569 movdqu(xmm10, Address(table, 1 * 16));
7570 vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
7571 vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
7572
7573 // Swap 1,0,3,2 - 01 00 11 10
7574 evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
7575 evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
7576 vextracti128(xmm5, xmm8, 1);
7577 evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
7578
7579 // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
7580 // instead of a cmp instruction, we use the negative flag with the jl instruction
7581 addl(len, 128 - 16);
7582 jcc(Assembler::less, L_final_reduction_for_128);
7583
7584 bind(L_16B_reduction_loop);
7585 vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7586 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7587 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7588 movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
7589 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7590 addl(pos, 16);
7591 subl(len, 16);
7592 jcc(Assembler::greaterEqual, L_16B_reduction_loop);
7593
7594 bind(L_final_reduction_for_128);
7595 addl(len, 16);
7596 jcc(Assembler::equal, L_128_done);
7597
7598 bind(L_get_last_two_xmms);
7599 movdqu(xmm2, xmm7);
7600 addl(pos, len);
7601 movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
7602 subl(pos, len);
7603
7604 // get rid of the extra data that was loaded before
7605 // load the shift constant
7606 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7607 movdqu(xmm0, Address(rax, len));
7608 addl(rax, len);
7609
7610 vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7611 //Change mask to 512
7612 vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
7613 vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
7614
7615 blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
7616 vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7617 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7618 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7619 vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
7620
7621 bind(L_128_done);
7622 // compute crc of a 128-bit value
7623 movdqu(xmm10, Address(table, 3 * 16));
7624 movdqu(xmm0, xmm7);
7625
7626 // 64b fold
7627 vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
7628 vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
7629 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7630
7631 // 32b fold
7632 movdqu(xmm0, xmm7);
7633 vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
7634 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7635 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7636 jmp(L_barrett);
7637
7638 bind(L_less_than_256);
7639 kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
7640
7641 // Barrett reduction
7642 bind(L_barrett);
7643 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
7644 movdqu(xmm1, xmm7);
7645 movdqu(xmm2, xmm7);
7646 movdqu(xmm10, Address(table, 4 * 16));
7647
7648 pclmulqdq(xmm7, xmm10, 0x0);
7649 pxor(xmm7, xmm2);
7650 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
7651 movdqu(xmm2, xmm7);
7652 pclmulqdq(xmm7, xmm10, 0x10);
7653 pxor(xmm7, xmm2);
7654 pxor(xmm7, xmm1);
7655 pextrd(crc, xmm7, 2);
7656
7657 bind(L_cleanup);
7658 addptr(rsp, 16 * 2 + 8);
7659 pop(r12);
7660 }
7661
7662 // S. Gueron / Information Processing Letters 112 (2012) 184
7663 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
7664 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
7665 // Output: the 64-bit carry-less product of B * CONST
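//
// A reference C sketch (not compiled here) of the table lookup performed below, assuming
// the table is laid out as 256 64-bit entries per precomputed constant index n:
//
//   static inline uint64_t crc32c_clmul_by_table_ref(uint32_t b, uint32_t n,
//                                                    const uint64_t* table) {
//     const uint64_t* t = table + (uint64_t)n * 256;
//     uint64_t q1 = t[ b        & 0xFF];
//     uint64_t q2 = t[(b >>  8) & 0xFF];
//     uint64_t q3 = t[(b >> 16) & 0xFF];
//     uint64_t q4 = t[(b >> 24) & 0xFF];
//     return q1 ^ (q2 << 8) ^ (q3 << 16) ^ (q4 << 24);
//   }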
7666 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
7667 Register tmp1, Register tmp2, Register tmp3) {
7668 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7669 if (n > 0) {
7670 addq(tmp3, n * 256 * 8);
7671 }
7672 // Q1 = TABLEExt[n][B & 0xFF];
7673 movl(tmp1, in);
7674 andl(tmp1, 0x000000FF);
7675 shll(tmp1, 3);
7676 addq(tmp1, tmp3);
7677 movq(tmp1, Address(tmp1, 0));
7678
7679 // Q2 = TABLEExt[n][B >> 8 & 0xFF];
7680 movl(tmp2, in);
7681 shrl(tmp2, 8);
7682 andl(tmp2, 0x000000FF);
7683 shll(tmp2, 3);
7684 addq(tmp2, tmp3);
7685 movq(tmp2, Address(tmp2, 0));
7686
7687 shlq(tmp2, 8);
7688 xorq(tmp1, tmp2);
7689
7690 // Q3 = TABLEExt[n][B >> 16 & 0xFF];
7691 movl(tmp2, in);
7692 shrl(tmp2, 16);
7693 andl(tmp2, 0x000000FF);
7694 shll(tmp2, 3);
7695 addq(tmp2, tmp3);
7696 movq(tmp2, Address(tmp2, 0));
7697
7698 shlq(tmp2, 16);
7699 xorq(tmp1, tmp2);
7700
7701 // Q4 = TABLEExt[n][B >> 24 & 0xFF];
7702 shrl(in, 24);
7703 andl(in, 0x000000FF);
7704 shll(in, 3);
7705 addq(in, tmp3);
7706 movq(in, Address(in, 0));
7707
7708 shlq(in, 24);
7709 xorq(in, tmp1);
7710 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7711 }
7712
7713 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7714 Register in_out,
7715 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7716 XMMRegister w_xtmp2,
7717 Register tmp1,
7718 Register n_tmp2, Register n_tmp3) {
7719 if (is_pclmulqdq_supported) {
7720 movdl(w_xtmp1, in_out); // modified blindly
7721
7722 movl(tmp1, const_or_pre_comp_const_index);
7723 movdl(w_xtmp2, tmp1);
7724 pclmulqdq(w_xtmp1, w_xtmp2, 0);
7725
7726 movdq(in_out, w_xtmp1);
7727 } else {
7728 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
7729 }
7730 }
7731
7732 // Recombination Alternative 2: No bit-reflections
7733 // T1 = (CRC_A * U1) << 1
7734 // T2 = (CRC_B * U2) << 1
7735 // C1 = T1 >> 32
7736 // C2 = T2 >> 32
7737 // T1 = T1 & 0xFFFFFFFF
7738 // T2 = T2 & 0xFFFFFFFF
7739 // T1 = CRC32(0, T1)
7740 // T2 = CRC32(0, T2)
7741 // C1 = C1 ^ T1
7742 // C2 = C2 ^ T2
7743 // CRC = C1 ^ C2 ^ CRC_C
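//
// A reference C sketch (not compiled here) of the recombination above. clmul32()
// (a 32x32 -> 64-bit carry-less multiply) and crc32c_u32() (the CRC32 instruction
// applied to one 32-bit word) are assumed helpers, named here for illustration only:
//
//   static inline uint32_t crc32c_recombine_ref(uint32_t crc_a, uint32_t crc_b, uint32_t crc_c,
//                                               uint32_t u1, uint32_t u2) {
//     uint64_t t1 = clmul32(crc_a, u1) << 1;
//     uint64_t t2 = clmul32(crc_b, u2) << 1;
//     uint32_t c1 = (uint32_t)(t1 >> 32) ^ crc32c_u32(0, (uint32_t)t1);
//     uint32_t c2 = (uint32_t)(t2 >> 32) ^ crc32c_u32(0, (uint32_t)t2);
//     return c1 ^ c2 ^ crc_c;
//   }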
7744 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7745 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7746 Register tmp1, Register tmp2,
7747 Register n_tmp3) {
7748 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7749 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7750 shlq(in_out, 1);
7751 movl(tmp1, in_out);
7752 shrq(in_out, 32);
7753 xorl(tmp2, tmp2);
7754 crc32(tmp2, tmp1, 4);
7755 xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
7756 shlq(in1, 1);
7757 movl(tmp1, in1);
7758 shrq(in1, 32);
7759 xorl(tmp2, tmp2);
7760 crc32(tmp2, tmp1, 4);
7761 xorl(in1, tmp2);
7762 xorl(in_out, in1);
7763 xorl(in_out, in2);
7764 }
7765
7766 // Set N to a predefined value.
7767 // Subtract it from the length of the buffer.
7768 // Execute in a loop:
7769 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
7770 // for i = 1 to N do
7771 // CRC_A = CRC32(CRC_A, A[i])
7772 // CRC_B = CRC32(CRC_B, B[i])
7773 // CRC_C = CRC32(CRC_C, C[i])
7774 // end for
7775 // Recombine
7776 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7777 Register in_out1, Register in_out2, Register in_out3,
7778 Register tmp1, Register tmp2, Register tmp3,
7779 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7780 Register tmp4, Register tmp5,
7781 Register n_tmp6) {
7782 Label L_processPartitions;
7783 Label L_processPartition;
7784 Label L_exit;
7785
7786 bind(L_processPartitions);
7787 cmpl(in_out1, 3 * size);
7788 jcc(Assembler::less, L_exit);
7789 xorl(tmp1, tmp1);
7790 xorl(tmp2, tmp2);
7791 movq(tmp3, in_out2);
7792 addq(tmp3, size);
7793
7794 bind(L_processPartition);
7795 crc32(in_out3, Address(in_out2, 0), 8);
7796 crc32(tmp1, Address(in_out2, size), 8);
7797 crc32(tmp2, Address(in_out2, size * 2), 8);
7798 addq(in_out2, 8);
7799 cmpq(in_out2, tmp3);
7800 jcc(Assembler::less, L_processPartition);
7801 crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7802 w_xtmp1, w_xtmp2, w_xtmp3,
7803 tmp4, tmp5,
7804 n_tmp6);
7805 addq(in_out2, 2 * size);
7806 subl(in_out1, 3 * size);
7807 jmp(L_processPartitions);
7808
7809 bind(L_exit);
7810 }
7811 #else
7812 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
7813 Register tmp1, Register tmp2, Register tmp3,
7814 XMMRegister xtmp1, XMMRegister xtmp2) {
7815 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7816 if (n > 0) {
7817 addl(tmp3, n * 256 * 8);
7818 }
7819 // Q1 = TABLEExt[n][B & 0xFF];
7820 movl(tmp1, in_out);
7821 andl(tmp1, 0x000000FF);
7822 shll(tmp1, 3);
7823 addl(tmp1, tmp3);
7824 movq(xtmp1, Address(tmp1, 0));
7825
7826 // Q2 = TABLEExt[n][B >> 8 & 0xFF];
7827 movl(tmp2, in_out);
7828 shrl(tmp2, 8);
7829 andl(tmp2, 0x000000FF);
7830 shll(tmp2, 3);
7831 addl(tmp2, tmp3);
7832 movq(xtmp2, Address(tmp2, 0));
7833
7834 psllq(xtmp2, 8);
7835 pxor(xtmp1, xtmp2);
7836
7837 // Q3 = TABLEExt[n][B >> 16 & 0xFF];
7838 movl(tmp2, in_out);
7839 shrl(tmp2, 16);
7840 andl(tmp2, 0x000000FF);
7841 shll(tmp2, 3);
7842 addl(tmp2, tmp3);
7843 movq(xtmp2, Address(tmp2, 0));
7844
7845 psllq(xtmp2, 16);
7846 pxor(xtmp1, xtmp2);
7847
7848 // Q4 = TABLEExt[n][B >> 24 & 0xFF];
7849 shrl(in_out, 24);
7850 andl(in_out, 0x000000FF);
7851 shll(in_out, 3);
7852 addl(in_out, tmp3);
7853 movq(xtmp2, Address(in_out, 0));
7854
7855 psllq(xtmp2, 24);
7856 pxor(xtmp1, xtmp2); // Result in CXMM
7857 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7858 }
7859
7860 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7861 Register in_out,
7862 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7863 XMMRegister w_xtmp2,
7864 Register tmp1,
7865 Register n_tmp2, Register n_tmp3) {
7866 if (is_pclmulqdq_supported) {
7867 movdl(w_xtmp1, in_out);
7868
7869 movl(tmp1, const_or_pre_comp_const_index);
7870 movdl(w_xtmp2, tmp1);
7871 pclmulqdq(w_xtmp1, w_xtmp2, 0);
7872 // Keep result in XMM since GPR is 32 bit in length
7873 } else {
7874 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
7875 }
7876 }
7877
7878 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7879 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7880 Register tmp1, Register tmp2,
7881 Register n_tmp3) {
7882 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7883 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7884
7885 psllq(w_xtmp1, 1);
7886 movdl(tmp1, w_xtmp1);
7887 psrlq(w_xtmp1, 32);
7888 movdl(in_out, w_xtmp1);
7889
7890 xorl(tmp2, tmp2);
7891 crc32(tmp2, tmp1, 4);
7892 xorl(in_out, tmp2);
7893
7894 psllq(w_xtmp2, 1);
7895 movdl(tmp1, w_xtmp2);
7896 psrlq(w_xtmp2, 32);
7897 movdl(in1, w_xtmp2);
7898
7899 xorl(tmp2, tmp2);
7900 crc32(tmp2, tmp1, 4);
7901 xorl(in1, tmp2);
7902 xorl(in_out, in1);
7903 xorl(in_out, in2);
7904 }
7905
7906 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7907 Register in_out1, Register in_out2, Register in_out3,
7908 Register tmp1, Register tmp2, Register tmp3,
7909 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7910 Register tmp4, Register tmp5,
7911 Register n_tmp6) {
7912 Label L_processPartitions;
7913 Label L_processPartition;
7914 Label L_exit;
7915
7916 bind(L_processPartitions);
7917 cmpl(in_out1, 3 * size);
7918 jcc(Assembler::less, L_exit);
7919 xorl(tmp1, tmp1);
7920 xorl(tmp2, tmp2);
7921 movl(tmp3, in_out2);
7922 addl(tmp3, size);
7923
7924 bind(L_processPartition);
7925 crc32(in_out3, Address(in_out2, 0), 4);
7926 crc32(tmp1, Address(in_out2, size), 4);
7927 crc32(tmp2, Address(in_out2, size*2), 4);
7928 crc32(in_out3, Address(in_out2, 0+4), 4);
7929 crc32(tmp1, Address(in_out2, size+4), 4);
7930 crc32(tmp2, Address(in_out2, size*2+4), 4);
7931 addl(in_out2, 8);
7932 cmpl(in_out2, tmp3);
7933 jcc(Assembler::less, L_processPartition);
7934
7935 push(tmp3);
7936 push(in_out1);
7937 push(in_out2);
7938 tmp4 = tmp3;
7939 tmp5 = in_out1;
7940 n_tmp6 = in_out2;
7941
7942 crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7943 w_xtmp1, w_xtmp2, w_xtmp3,
7944 tmp4, tmp5,
7945 n_tmp6);
7946
7947 pop(in_out2);
7948 pop(in_out1);
7949 pop(tmp3);
7950
7951 addl(in_out2, 2 * size);
7952 subl(in_out1, 3 * size);
7953 jmp(L_processPartitions);
7954
7955 bind(L_exit);
7956 }
7957 #endif //LP64
7958
7959 #ifdef _LP64
7960 // Algorithm 2: Pipelined usage of the CRC32 instruction.
7961 // Input: A buffer I of L bytes.
7962 // Output: the CRC32C value of the buffer.
7963 // Notations:
7964 // Write L = 24N + r, with N = floor (L/24).
7965 // r = L mod 24 (0 <= r < 24).
7966 // Consider I as the concatenation of A|B|C|R, where A, B and C each consist of
7967 // N quadwords, and R consists of r bytes.
7968 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
7969 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
7970 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
7971 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
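//
// A much-simplified reference C sketch (not compiled here) of the A|B|C|R split described
// above, processing a single chunk size. crc32c_u64()/crc32c_u8() wrap the CRC32 instruction
// and crc32c_recombine() stands for the recombination shown earlier; all helper names are
// illustrative only, and unaligned-load details are glossed over. The code below additionally
// iterates over several precomputed chunk sizes (CRC32C_HIGH/MIDDLE/LOW).
//
//   uint32_t crc32c_alg2_ref(uint32_t crc, const uint8_t* buf, size_t len) {
//     size_t n = len / 24;                        // L = 24N + r
//     if (n > 0) {
//       const uint64_t* a = (const uint64_t*)buf; // A, B, C: N quadwords each
//       const uint64_t* b = a + n;
//       const uint64_t* c = b + n;
//       uint32_t crc_a = crc, crc_b = 0, crc_c = 0;
//       for (size_t j = 0; j < n; j++) {          // three independent CRC streams
//         crc_a = crc32c_u64(crc_a, a[j]);
//         crc_b = crc32c_u64(crc_b, b[j]);
//         crc_c = crc32c_u64(crc_c, c[j]);
//       }
//       crc = crc32c_recombine(crc_a, crc_b, crc_c /*, constants for this N */);
//     }
//     for (size_t i = 24 * n; i < len; i++) {     // remaining r bytes
//       crc = crc32c_u8(crc, buf[i]);
//     }
//     return crc;
//   }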
7972 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7973 Register tmp1, Register tmp2, Register tmp3,
7974 Register tmp4, Register tmp5, Register tmp6,
7975 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7976 bool is_pclmulqdq_supported) {
7977 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7978 Label L_wordByWord;
7979 Label L_byteByByteProlog;
7980 Label L_byteByByte;
7981 Label L_exit;
7982
7983 if (is_pclmulqdq_supported ) {
7984 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7985 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
7986
7987 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7988 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7989
7990 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
7991 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
7992 assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
7993 } else {
7994 const_or_pre_comp_const_index[0] = 1;
7995 const_or_pre_comp_const_index[1] = 0;
7996
7997 const_or_pre_comp_const_index[2] = 3;
7998 const_or_pre_comp_const_index[3] = 2;
7999
8000 const_or_pre_comp_const_index[4] = 5;
8001 const_or_pre_comp_const_index[5] = 4;
8002 }
8003 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8004 in2, in1, in_out,
8005 tmp1, tmp2, tmp3,
8006 w_xtmp1, w_xtmp2, w_xtmp3,
8007 tmp4, tmp5,
8008 tmp6);
8009 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8010 in2, in1, in_out,
8011 tmp1, tmp2, tmp3,
8012 w_xtmp1, w_xtmp2, w_xtmp3,
8013 tmp4, tmp5,
8014 tmp6);
8015 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8016 in2, in1, in_out,
8017 tmp1, tmp2, tmp3,
8018 w_xtmp1, w_xtmp2, w_xtmp3,
8019 tmp4, tmp5,
8020 tmp6);
8021 movl(tmp1, in2);
8022 andl(tmp1, 0x00000007);
8023 negl(tmp1);
8024 addl(tmp1, in2);
8025 addq(tmp1, in1);
8026
8027 cmpq(in1, tmp1);
8028 jccb(Assembler::greaterEqual, L_byteByByteProlog);
8029 align(16);
8030 BIND(L_wordByWord);
8031 crc32(in_out, Address(in1, 0), 8);
8032 addq(in1, 8);
8033 cmpq(in1, tmp1);
8034 jcc(Assembler::less, L_wordByWord);
8035
8036 BIND(L_byteByByteProlog);
8037 andl(in2, 0x00000007);
8038 movl(tmp2, 1);
8039
8040 cmpl(tmp2, in2);
8041 jccb(Assembler::greater, L_exit);
8042 BIND(L_byteByByte);
8043 crc32(in_out, Address(in1, 0), 1);
8044 incq(in1);
8045 incl(tmp2);
8046 cmpl(tmp2, in2);
8047 jcc(Assembler::lessEqual, L_byteByByte);
8048
8049 BIND(L_exit);
8050 }
8051 #else
8052 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
8053 Register tmp1, Register tmp2, Register tmp3,
8054 Register tmp4, Register tmp5, Register tmp6,
8055 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8056 bool is_pclmulqdq_supported) {
8057 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
8058 Label L_wordByWord;
8059 Label L_byteByByteProlog;
8060 Label L_byteByByte;
8061 Label L_exit;
8062
8063 if (is_pclmulqdq_supported) {
8064 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
8065 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
8066
8067 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
8068 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
8069
8070 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8071 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8072 } else {
8073 const_or_pre_comp_const_index[0] = 1;
8074 const_or_pre_comp_const_index[1] = 0;
8075
8076 const_or_pre_comp_const_index[2] = 3;
8077 const_or_pre_comp_const_index[3] = 2;
8078
8079 const_or_pre_comp_const_index[4] = 5;
8080 const_or_pre_comp_const_index[5] = 4;
8081 }
8082 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8083 in2, in1, in_out,
8084 tmp1, tmp2, tmp3,
8085 w_xtmp1, w_xtmp2, w_xtmp3,
8086 tmp4, tmp5,
8087 tmp6);
8088 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8089 in2, in1, in_out,
8090 tmp1, tmp2, tmp3,
8091 w_xtmp1, w_xtmp2, w_xtmp3,
8092 tmp4, tmp5,
8093 tmp6);
8094 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8095 in2, in1, in_out,
8096 tmp1, tmp2, tmp3,
8097 w_xtmp1, w_xtmp2, w_xtmp3,
8098 tmp4, tmp5,
8099 tmp6);
8100 movl(tmp1, in2);
8101 andl(tmp1, 0x00000007);
8102 negl(tmp1);
8103 addl(tmp1, in2);
8104 addl(tmp1, in1);
8105
8106 BIND(L_wordByWord);
8107 cmpl(in1, tmp1);
8108 jcc(Assembler::greaterEqual, L_byteByByteProlog);
8109 crc32(in_out, Address(in1,0), 4);
8110 addl(in1, 4);
8111 jmp(L_wordByWord);
8112
8113 BIND(L_byteByByteProlog);
8114 andl(in2, 0x00000007);
8115 movl(tmp2, 1);
8116
8117 BIND(L_byteByByte);
8118 cmpl(tmp2, in2);
8119 jccb(Assembler::greater, L_exit);
8120 movb(tmp1, Address(in1, 0));
8121 crc32(in_out, tmp1, 1);
8122 incl(in1);
8123 incl(tmp2);
8124 jmp(L_byteByByte);
8125
8126 BIND(L_exit);
8127 }
8128 #endif // LP64
8129 #undef BIND
8130 #undef BLOCK_COMMENT
8131
8132 // Compress char[] array to byte[].
8133 // ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
8134 // @IntrinsicCandidate
8135 // private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8136 // for (int i = 0; i < len; i++) {
8137 // int c = src[srcOff++];
8138 // if (c >>> 8 != 0) {
8139 // return 0;
8140 // }
8141 // dst[dstOff++] = (byte)c;
8142 // }
8143 // return len;
8144 // }
8145 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
8146 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8147 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8148 Register tmp5, Register result, KRegister mask1, KRegister mask2) {
8149 Label copy_chars_loop, return_length, return_zero, done;
8150
8151 // rsi: src
8152 // rdi: dst
8153 // rdx: len
8154 // rcx: tmp5
8155 // rax: result
8156
8157 // rsi holds start addr of source char[] to be compressed
8158 // rdi holds start addr of destination byte[]
8159 // rdx holds length
8160
8161 assert(len != result, "");
8162
8163 // save length for return
8164 push(len);
8165
8166 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
8167 VM_Version::supports_avx512vlbw() &&
8168 VM_Version::supports_bmi2()) {
8169
8170 Label copy_32_loop, copy_loop_tail, below_threshold;
8171
8172 // alignment
8173 Label post_alignment;
8174
8175 // if the length of the string is less than 32, handle it the old-fashioned way
8176 testl(len, -32);
8177 jcc(Assembler::zero, below_threshold);
8178
8179 // First check whether a character is compressible (<= 0xFF).
8180 // Create mask to test for Unicode chars inside zmm vector
8181 movl(result, 0x00FF);
8182 evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
8183
8184 testl(len, -64);
8185 jcc(Assembler::zero, post_alignment);
8186
8187 movl(tmp5, dst);
8188 andl(tmp5, (32 - 1));
8189 negl(tmp5);
8190 andl(tmp5, (32 - 1));
8191
8192 // bail out when there is nothing to be done
8193 testl(tmp5, 0xFFFFFFFF);
8194 jcc(Assembler::zero, post_alignment);
8195
8196 // ~(~0 << len), where len is the # of remaining elements to process
8197 movl(result, 0xFFFFFFFF);
8198 shlxl(result, result, tmp5);
8199 notl(result);
8200 kmovdl(mask2, result);
8201
8202 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8203 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8204 ktestd(mask1, mask2);
8205 jcc(Assembler::carryClear, return_zero);
8206
8207 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8208
8209 addptr(src, tmp5);
8210 addptr(src, tmp5);
8211 addptr(dst, tmp5);
8212 subl(len, tmp5);
8213
8214 bind(post_alignment);
8215 // end of alignment
8216
8217 movl(tmp5, len);
8218 andl(tmp5, (32 - 1)); // tail count (in chars)
8219 andl(len, ~(32 - 1)); // vector count (in chars)
8220 jcc(Assembler::zero, copy_loop_tail);
8221
8222 lea(src, Address(src, len, Address::times_2));
8223 lea(dst, Address(dst, len, Address::times_1));
8224 negptr(len);
8225
8226 bind(copy_32_loop);
8227 evmovdquw(tmp1Reg, Address(src, len, Address::times_2), /*merge*/ false, Assembler::AVX_512bit);
8228 evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
8229 kortestdl(mask1, mask1);
8230 jcc(Assembler::carryClear, return_zero);
8231
8232 // All elements in the currently processed chunk are valid candidates for
8233 // compression. Write the truncated byte elements to memory.
8234 evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
8235 addptr(len, 32);
8236 jcc(Assembler::notZero, copy_32_loop);
8237
8238 bind(copy_loop_tail);
8239 // bail out when there is nothing to be done
8240 testl(tmp5, 0xFFFFFFFF);
8241 jcc(Assembler::zero, return_length);
8242
8243 movl(len, tmp5);
8244
8245 // ~(~0 << len), where len is the # of remaining elements to process
8246 movl(result, 0xFFFFFFFF);
8247 shlxl(result, result, len);
8248 notl(result);
8249
8250 kmovdl(mask2, result);
8251
8252 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8253 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8254 ktestd(mask1, mask2);
8255 jcc(Assembler::carryClear, return_zero);
8256
8257 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8258 jmp(return_length);
8259
8260 bind(below_threshold);
8261 }
8262
8263 if (UseSSE42Intrinsics) {
8264 Label copy_32_loop, copy_16, copy_tail;
8265
8266 movl(result, len);
8267
8268 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
8269
8270 // vectored compression
8271 andl(len, 0xfffffff0); // vector count (in chars)
8272 andl(result, 0x0000000f); // tail count (in chars)
8273 testl(len, len);
8274 jcc(Assembler::zero, copy_16);
8275
8276 // compress 16 chars per iter
8277 movdl(tmp1Reg, tmp5);
8278 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
8279 pxor(tmp4Reg, tmp4Reg);
8280
8281 lea(src, Address(src, len, Address::times_2));
8282 lea(dst, Address(dst, len, Address::times_1));
8283 negptr(len);
8284
8285 bind(copy_32_loop);
8286 movdqu(tmp2Reg, Address(src, len, Address::times_2)); // load 1st 8 characters
8287 por(tmp4Reg, tmp2Reg);
8288 movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
8289 por(tmp4Reg, tmp3Reg);
8290 ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector
8291 jcc(Assembler::notZero, return_zero);
8292 packuswb(tmp2Reg, tmp3Reg); // only Latin-1 chars; compress each to 1 byte
8293 movdqu(Address(dst, len, Address::times_1), tmp2Reg);
8294 addptr(len, 16);
8295 jcc(Assembler::notZero, copy_32_loop);
8296
8297 // compress next vector of 8 chars (if any)
8298 bind(copy_16);
8299 movl(len, result);
8300 andl(len, 0xfffffff8); // vector count (in chars)
8301 andl(result, 0x00000007); // tail count (in chars)
8302 testl(len, len);
8303 jccb(Assembler::zero, copy_tail);
8304
8305 movdl(tmp1Reg, tmp5);
8306 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
8307 pxor(tmp3Reg, tmp3Reg);
8308
8309 movdqu(tmp2Reg, Address(src, 0));
8310 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
8311 jccb(Assembler::notZero, return_zero);
8312 packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte
8313 movq(Address(dst, 0), tmp2Reg);
8314 addptr(src, 16);
8315 addptr(dst, 8);
8316
8317 bind(copy_tail);
8318 movl(len, result);
8319 }
8320 // compress 1 char per iter
8321 testl(len, len);
8322 jccb(Assembler::zero, return_length);
8323 lea(src, Address(src, len, Address::times_2));
8324 lea(dst, Address(dst, len, Address::times_1));
8325 negptr(len);
8326
8327 bind(copy_chars_loop);
8328 load_unsigned_short(result, Address(src, len, Address::times_2));
8329 testl(result, 0xff00); // check if Unicode char
8330 jccb(Assembler::notZero, return_zero);
8331 movb(Address(dst, len, Address::times_1), result); // ASCII char; compress to 1 byte
8332 increment(len);
8333 jcc(Assembler::notZero, copy_chars_loop);
8334
8335 // if compression succeeded, return length
8336 bind(return_length);
8337 pop(result);
8338 jmpb(done);
8339
8340 // if compression failed, return 0
8341 bind(return_zero);
8342 xorl(result, result);
8343 addptr(rsp, wordSize);
8344
8345 bind(done);
8346 }
8347
8348 // Inflate byte[] array to char[].
8349 // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
8350 // @IntrinsicCandidate
8351 // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
8352 // for (int i = 0; i < len; i++) {
8353 // dst[dstOff++] = (char)(src[srcOff++] & 0xff);
8354 // }
8355 // }
8356 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
8357 XMMRegister tmp1, Register tmp2, KRegister mask) {
8358 Label copy_chars_loop, done, below_threshold, avx3_threshold;
8359 // rsi: src
8360 // rdi: dst
8361 // rdx: len
8362 // rcx: tmp2
8363
8364 // rsi holds start addr of source byte[] to be inflated
8365 // rdi holds start addr of destination char[]
8366 // rdx holds length
8367 assert_different_registers(src, dst, len, tmp2);
8368 movl(tmp2, len);
8369 if ((UseAVX > 2) && // AVX512
8370 VM_Version::supports_avx512vlbw() &&
8371 VM_Version::supports_bmi2()) {
8372
8373 Label copy_32_loop, copy_tail;
8374 Register tmp3_aliased = len;
8375
8376 // if the length of the string is less than 16, handle it the old-fashioned way
8377 testl(len, -16);
8378 jcc(Assembler::zero, below_threshold);
8379
8380 testl(len, -1 * AVX3Threshold);
8381 jcc(Assembler::zero, avx3_threshold);
8382
8383     // Pre-compute the tail and vector counts here so that the main loop
8384     // needs only a single arithmetic operation per iteration
8385 andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
8386 andl(len, -32); // vector count
8387 jccb(Assembler::zero, copy_tail);
8388
8389 lea(src, Address(src, len, Address::times_1));
8390 lea(dst, Address(dst, len, Address::times_2));
8391 negptr(len);
8392
8393
8394 // inflate 32 chars per iter
8395 bind(copy_32_loop);
8396 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
8397 evmovdquw(Address(dst, len, Address::times_2), tmp1, /*merge*/ false, Assembler::AVX_512bit);
8398 addptr(len, 32);
8399 jcc(Assembler::notZero, copy_32_loop);
8400
8401 bind(copy_tail);
8402 // bail out when there is nothing to be done
8403 testl(tmp2, -1); // we don't destroy the contents of tmp2 here
8404 jcc(Assembler::zero, done);
8405
8406 // ~(~0 << length), where length is the # of remaining elements to process
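    // e.g. tmp2 == 5: ~(~0 << 5) == 0x1f, so only the low 5 mask bits (word lanes) are set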
8407 movl(tmp3_aliased, -1);
8408 shlxl(tmp3_aliased, tmp3_aliased, tmp2);
8409 notl(tmp3_aliased);
8410 kmovdl(mask, tmp3_aliased);
8411 evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
8412 evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
8413
8414 jmp(done);
8415 bind(avx3_threshold);
8416 }
8417 if (UseSSE42Intrinsics) {
8418 Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
8419
8420 if (UseAVX > 1) {
8421 andl(tmp2, (16 - 1));
8422 andl(len, -16);
8423 jccb(Assembler::zero, copy_new_tail);
8424 } else {
8425 andl(tmp2, 0x00000007); // tail count (in chars)
8426 andl(len, 0xfffffff8); // vector count (in chars)
8427 jccb(Assembler::zero, copy_tail);
8428 }
8429
8430 // vectored inflation
8431 lea(src, Address(src, len, Address::times_1));
8432 lea(dst, Address(dst, len, Address::times_2));
8433 negptr(len);
8434
8435 if (UseAVX > 1) {
8436 bind(copy_16_loop);
8437 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
8438 vmovdqu(Address(dst, len, Address::times_2), tmp1);
8439 addptr(len, 16);
8440 jcc(Assembler::notZero, copy_16_loop);
8441
8442 bind(below_threshold);
8443 bind(copy_new_tail);
8444 movl(len, tmp2);
8445 andl(tmp2, 0x00000007);
8446 andl(len, 0xFFFFFFF8);
8447 jccb(Assembler::zero, copy_tail);
8448
8449 pmovzxbw(tmp1, Address(src, 0));
8450 movdqu(Address(dst, 0), tmp1);
8451 addptr(src, 8);
8452 addptr(dst, 2 * 8);
8453
8454 jmp(copy_tail, true);
8455 }
8456
8457 // inflate 8 chars per iter
8458 bind(copy_8_loop);
8459 pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
8460 movdqu(Address(dst, len, Address::times_2), tmp1);
8461 addptr(len, 8);
8462 jcc(Assembler::notZero, copy_8_loop);
8463
8464 bind(copy_tail);
8465 movl(len, tmp2);
8466
8467 cmpl(len, 4);
8468 jccb(Assembler::less, copy_bytes);
8469
8470 movdl(tmp1, Address(src, 0)); // load 4 byte chars
8471 pmovzxbw(tmp1, tmp1);
8472 movq(Address(dst, 0), tmp1);
8473 subptr(len, 4);
8474 addptr(src, 4);
8475 addptr(dst, 8);
8476
8477 bind(copy_bytes);
8478 } else {
8479 bind(below_threshold);
8480 }
8481
8482 testl(len, len);
8483 jccb(Assembler::zero, done);
8484 lea(src, Address(src, len, Address::times_1));
8485 lea(dst, Address(dst, len, Address::times_2));
8486 negptr(len);
8487
8488 // inflate 1 char per iter
8489 bind(copy_chars_loop);
8490 load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char
8491 movw(Address(dst, len, Address::times_2), tmp2); // inflate byte char to word
8492 increment(len);
8493 jcc(Assembler::notZero, copy_chars_loop);
8494
8495 bind(done);
8496 }
8497
8498
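// Masked vector load: dispatch to the element-size variant of evmovdqu
// matching the given Java basic type.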
8499 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
8500 switch(type) {
8501 case T_BYTE:
8502 case T_BOOLEAN:
8503 evmovdqub(dst, kmask, src, false, vector_len);
8504 break;
8505 case T_CHAR:
8506 case T_SHORT:
8507 evmovdquw(dst, kmask, src, false, vector_len);
8508 break;
8509 case T_INT:
8510 case T_FLOAT:
8511 evmovdqul(dst, kmask, src, false, vector_len);
8512 break;
8513 case T_LONG:
8514 case T_DOUBLE:
8515 evmovdquq(dst, kmask, src, false, vector_len);
8516 break;
8517 default:
8518 fatal("Unexpected type argument %s", type2name(type));
8519 break;
8520 }
8521 }
8522
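// Masked vector store: same element-size dispatch as above, for the
// memory-destination form.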
8523 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
8524 switch(type) {
8525 case T_BYTE:
8526 case T_BOOLEAN:
8527 evmovdqub(dst, kmask, src, true, vector_len);
8528 break;
8529 case T_CHAR:
8530 case T_SHORT:
8531 evmovdquw(dst, kmask, src, true, vector_len);
8532 break;
8533 case T_INT:
8534 case T_FLOAT:
8535 evmovdqul(dst, kmask, src, true, vector_len);
8536 break;
8537 case T_LONG:
8538 case T_DOUBLE:
8539 evmovdquq(dst, kmask, src, true, vector_len);
8540 break;
8541 default:
8542 fatal("Unexpected type argument %s", type2name(type));
8543 break;
8544 }
8545 }
8546
8547 #if COMPILER2_OR_JVMCI
8548
8549
8550 // Masked fill (memory set) operation for lengths of less than 64 bytes.
8551 void MacroAssembler::fill64_masked_avx(uint shift, Register dst, int disp,
8552 XMMRegister xmm, KRegister mask, Register length,
8553 Register temp, bool use64byteVector) {
8554 assert(MaxVectorSize >= 32, "vector length should be >= 32");
8555   assert(shift != 0, "shift value should be 1 (short), 2 (int) or 3 (long)");
8556 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
8557 if (!use64byteVector) {
8558 fill32_avx(dst, disp, xmm);
8559 subptr(length, 32 >> shift);
8560 fill32_masked_avx(shift, dst, disp + 32, xmm, mask, length, temp);
8561 } else {
8562 assert(MaxVectorSize == 64, "vector length != 64");
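    // Build mask = (1 << length) - 1 so the masked 64-byte store writes exactly 'length' elements.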
8563 movl(temp, 1);
8564 shlxl(temp, temp, length);
8565 subptr(temp, 1);
8566 kmovwl(mask, temp);
8567 evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_512bit);
8568 }
8569 }
8570
8571
8572 void MacroAssembler::fill32_masked_avx(uint shift, Register dst, int disp,
8573 XMMRegister xmm, KRegister mask, Register length,
8574 Register temp) {
8575 assert(MaxVectorSize >= 32, "vector length should be >= 32");
8576   assert(shift != 0, "shift value should be 1 (short), 2 (int) or 3 (long)");
8577 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
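  // mask = (1 << length) - 1: select the low 'length' elements for the masked 32-byte store.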
8578 movl(temp, 1);
8579 shlxl(temp, temp, length);
8580 subptr(temp, 1);
8581 kmovwl(mask, temp);
8582 evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_256bit);
8583 }
8584
8585 void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
8586 assert(MaxVectorSize >= 32, "vector length should be >= 32");
8587 vmovdqu(dst, xmm);
8588 }
8589
8590 void MacroAssembler::fill32_avx(Register dst, int disp, XMMRegister xmm) {
8591 fill32(Address(dst, disp), xmm);
8592 }
8593
8594 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
8595 assert(MaxVectorSize >= 32, "vector length should be >= 32");
8596 if (!use64byteVector) {
8597 fill32(dst, xmm);
8598 fill32(dst.plus_disp(32), xmm);
8599 } else {
8600 evmovdquq(dst, xmm, Assembler::AVX_512bit);
8601 }
8602 }
8603
8604 void MacroAssembler::fill64_avx(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
8605 fill64(Address(dst, disp), xmm, use64byteVector);
8606 }
8607
8608 #endif //COMPILER2_OR_JVMCI
8609
8610
8611 #ifdef _LP64
8612 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
8613 Label done;
8614 cvttss2sil(dst, src);
8615 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
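  // cvttss2sil produces the integer indefinite value 0x80000000 for those inputs,
  // so compare against it to detect when the fixup stub is needed.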
8616 cmpl(dst, 0x80000000); // float_sign_flip
8617 jccb(Assembler::notEqual, done);
8618 subptr(rsp, 8);
8619 movflt(Address(rsp, 0), src);
8620 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
8621 pop(dst);
8622 bind(done);
8623 }
8624
8625 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
8626 Label done;
8627 cvttsd2sil(dst, src);
8628 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
8629 cmpl(dst, 0x80000000); // float_sign_flip
8630 jccb(Assembler::notEqual, done);
8631 subptr(rsp, 8);
8632 movdbl(Address(rsp, 0), src);
8633 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
8634 pop(dst);
8635 bind(done);
8636 }
8637
8638 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
8639 Label done;
8640 cvttss2siq(dst, src);
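  // cvttss2siq produces the 64-bit integer indefinite value 0x8000000000000000
  // (== double_sign_flip) for overflow, underflow and NaN -> fixup in stub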
8641 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
8642 jccb(Assembler::notEqual, done);
8643 subptr(rsp, 8);
8644 movflt(Address(rsp, 0), src);
8645 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
8646 pop(dst);
8647 bind(done);
8648 }
8649
8650 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
8651 Label done;
8652 cvttsd2siq(dst, src);
8653 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
8654 jccb(Assembler::notEqual, done);
8655 subptr(rsp, 8);
8656 movdbl(Address(rsp, 0), src);
8657 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
8658 pop(dst);
8659 bind(done);
8660 }
8661
8662 void MacroAssembler::cache_wb(Address line)
8663 {
8664   // 64-bit CPUs always support clflush
8665 assert(VM_Version::supports_clflush(), "clflush should be available");
8666 bool optimized = VM_Version::supports_clflushopt();
8667 bool no_evict = VM_Version::supports_clwb();
8668
8669   // prefer clwb (writeback without evict); otherwise
8670   // prefer clflushopt (potentially parallel writeback with evict);
8671   // otherwise fall back on clflush (serial writeback with evict)
8672
8673 if (optimized) {
8674 if (no_evict) {
8675 clwb(line);
8676 } else {
8677 clflushopt(line);
8678 }
8679 } else {
8680 // no need for fence when using CLFLUSH
8681 clflush(line);
8682 }
8683 }
8684
8685 void MacroAssembler::cache_wbsync(bool is_pre)
8686 {
8687 assert(VM_Version::supports_clflush(), "clflush should be available");
8688 bool optimized = VM_Version::supports_clflushopt();
8689 bool no_evict = VM_Version::supports_clwb();
8690
8691 // pick the correct implementation
8692
8693 if (!is_pre && (optimized || no_evict)) {
8694     // need an sfence for post flush when using clflushopt or clwb
8695     // otherwise no need for any synchronization
8696
8697 sfence();
8698 }
8699 }
8700
8701 #endif // _LP64
8702
8703 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
8704 switch (cond) {
8705 // Note some conditions are synonyms for others
8706 case Assembler::zero: return Assembler::notZero;
8707 case Assembler::notZero: return Assembler::zero;
8708 case Assembler::less: return Assembler::greaterEqual;
8709 case Assembler::lessEqual: return Assembler::greater;
8710 case Assembler::greater: return Assembler::lessEqual;
8711 case Assembler::greaterEqual: return Assembler::less;
8712 case Assembler::below: return Assembler::aboveEqual;
8713 case Assembler::belowEqual: return Assembler::above;
8714 case Assembler::above: return Assembler::belowEqual;
8715 case Assembler::aboveEqual: return Assembler::below;
8716 case Assembler::overflow: return Assembler::noOverflow;
8717 case Assembler::noOverflow: return Assembler::overflow;
8718 case Assembler::negative: return Assembler::positive;
8719 case Assembler::positive: return Assembler::negative;
8720 case Assembler::parity: return Assembler::noParity;
8721 case Assembler::noParity: return Assembler::parity;
8722 }
8723 ShouldNotReachHere(); return Assembler::overflow;
8724 }
8725
8726 SkipIfEqual::SkipIfEqual(
8727 MacroAssembler* masm, const bool* flag_addr, bool value) {
8728 _masm = masm;
8729 _masm->cmp8(ExternalAddress((address)flag_addr), value);
8730 _masm->jcc(Assembler::equal, _label);
8731 }
8732
8733 SkipIfEqual::~SkipIfEqual() {
8734 _masm->bind(_label);
8735 }
8736
8737 // 32-bit Windows has its own fast-path implementation
8738 // of get_thread
8739 #if !defined(WIN32) || defined(_LP64)
8740
8741 // This is simply a call to Thread::current()
8742 void MacroAssembler::get_thread(Register thread) {
8743 if (thread != rax) {
8744 push(rax);
8745 }
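  // Save the argument/scratch registers that the C call below could clobber;
  // Thread::current() returns its result in rax.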
8746 LP64_ONLY(push(rdi);)
8747 LP64_ONLY(push(rsi);)
8748 push(rdx);
8749 push(rcx);
8750 #ifdef _LP64
8751 push(r8);
8752 push(r9);
8753 push(r10);
8754 push(r11);
8755 #endif
8756
8757 MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
8758
8759 #ifdef _LP64
8760 pop(r11);
8761 pop(r10);
8762 pop(r9);
8763 pop(r8);
8764 #endif
8765 pop(rcx);
8766 pop(rdx);
8767 LP64_ONLY(pop(rsi);)
8768 LP64_ONLY(pop(rdi);)
8769 if (thread != rax) {
8770 mov(thread, rax);
8771 pop(rax);
8772 }
8773 }
8774
8775 #endif // !WIN32 || _LP64
8776
8777 // Implements lightweight-locking.
8778 //
8779 // obj: the object to be locked
8780 // reg_rax: rax
8781 // thread: the thread which attempts to lock obj
8782 // tmp: a temporary register
8783 void MacroAssembler::lightweight_lock(Register obj, Register reg_rax, Register thread, Register tmp, Label& slow) {
8784 assert(reg_rax == rax, "");
8785 assert_different_registers(obj, reg_rax, thread, tmp);
8786
8787 Label push;
8788 const Register top = tmp;
8789
8790 // Preload the markWord. It is important that this is the first
8791 // instruction emitted as it is part of C1's null check semantics.
8792 movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
8793
8794 // Load top.
8795 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
8796
8797 // Check if the lock-stack is full.
8798 cmpl(top, LockStack::end_offset());
8799 jcc(Assembler::greaterEqual, slow);
8800
8801 // Check for recursion.
8802 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
8803 jcc(Assembler::equal, push);
8804
8805 // Check header for monitor (0b10).
8806 testptr(reg_rax, markWord::monitor_value);
8807 jcc(Assembler::notZero, slow);
8808
8809 // Try to lock. Transition lock bits 0b01 => 0b00
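  // Expected value (reg_rax) keeps the unlocked bit set; the new value (tmp) has it cleared.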
8810 movptr(tmp, reg_rax);
8811 andptr(tmp, ~(int32_t)markWord::unlocked_value);
8812 orptr(reg_rax, markWord::unlocked_value);
8813 lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
8814 jcc(Assembler::notEqual, slow);
8815
8816   // Restore top; tmp was clobbered above to hold the new markWord for the CAS.
8817 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
8818
8819 bind(push);
8820 // After successful lock, push object on lock-stack.
8821 movptr(Address(thread, top), obj);
8822 incrementl(top, oopSize);
8823 movl(Address(thread, JavaThread::lock_stack_top_offset()), top);
8824 }
8825
8826 // Implements lightweight-unlocking.
8827 //
8828 // obj: the object to be unlocked
8829 // reg_rax: rax
8830 // thread: the thread
8831 // tmp: a temporary register
8832 //
8833 // x86_32 Note: reg_rax and thread may alias each other due to limited register
8834 //              availability.
8835 void MacroAssembler::lightweight_unlock(Register obj, Register reg_rax, Register thread, Register tmp, Label& slow) {
8836 assert(reg_rax == rax, "");
8837 assert_different_registers(obj, reg_rax, tmp);
8838 LP64_ONLY(assert_different_registers(obj, reg_rax, thread, tmp);)
8839
8840 Label unlocked, push_and_slow;
8841 const Register top = tmp;
8842
8843 // Check if obj is top of lock-stack.
8844 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
8845 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
8846 jcc(Assembler::notEqual, slow);
8847
8848 // Pop lock-stack.
8849 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
8850 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
8851
8852 // Check if recursive.
8853 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
8854 jcc(Assembler::equal, unlocked);
8855
8856 // Not recursive. Check header for monitor (0b10).
8857 movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
8858 testptr(reg_rax, markWord::monitor_value);
8859 jcc(Assembler::notZero, push_and_slow);
8860
8861 #ifdef ASSERT
8862 // Check header not unlocked (0b01).
8863 Label not_unlocked;
8864 testptr(reg_rax, markWord::unlocked_value);
8865 jcc(Assembler::zero, not_unlocked);
8866 stop("lightweight_unlock already unlocked");
8867 bind(not_unlocked);
8868 #endif
8869
8870 // Try to unlock. Transition lock bits 0b00 => 0b01
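  // Expected value (reg_rax) is the current locked header; the new value (tmp) has the unlocked bit set.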
8871 movptr(tmp, reg_rax);
8872 orptr(tmp, markWord::unlocked_value);
8873 lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
8874 jcc(Assembler::equal, unlocked);
8875
8876 bind(push_and_slow);
8877 // Restore lock-stack and handle the unlock in runtime.
8878 if (thread == reg_rax) {
8879 // On x86_32 we may lose the thread.
8880 get_thread(thread);
8881 }
8882 #ifdef ASSERT
8883 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
8884 movptr(Address(thread, top), obj);
8885 #endif
8886 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
8887 jmp(slow);
8888
8889 bind(unlocked);
8890 }