1 /*
2 * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "code/aotCodeCache.hpp"
28 #include "code/compiledIC.hpp"
29 #include "compiler/compiler_globals.hpp"
30 #include "compiler/disassembler.hpp"
31 #include "crc32c.h"
32 #include "gc/shared/barrierSet.hpp"
33 #include "gc/shared/barrierSetAssembler.hpp"
34 #include "gc/shared/collectedHeap.inline.hpp"
35 #include "gc/shared/tlab_globals.hpp"
36 #include "interpreter/bytecodeHistogram.hpp"
37 #include "interpreter/interpreter.hpp"
38 #include "interpreter/interpreterRuntime.hpp"
39 #include "jvm.h"
40 #include "memory/resourceArea.hpp"
41 #include "memory/universe.hpp"
42 #include "oops/accessDecorators.hpp"
43 #include "oops/compressedKlass.inline.hpp"
44 #include "oops/compressedOops.inline.hpp"
45 #include "oops/klass.inline.hpp"
46 #include "prims/methodHandles.hpp"
47 #include "runtime/continuation.hpp"
48 #include "runtime/interfaceSupport.inline.hpp"
49 #include "runtime/javaThread.hpp"
50 #include "runtime/jniHandles.hpp"
51 #include "runtime/objectMonitor.hpp"
52 #include "runtime/os.hpp"
53 #include "runtime/safepoint.hpp"
54 #include "runtime/safepointMechanism.hpp"
55 #include "runtime/sharedRuntime.hpp"
56 #include "runtime/stubRoutines.hpp"
57 #include "utilities/checkedCast.hpp"
58 #include "utilities/macros.hpp"
59
// Debug-build helpers: in non-product builds, BLOCK_COMMENT records a
// comment in the code stream (visible in disassembly) and STOP prefixes
// the stop() with the message; in product builds both collapse to the
// bare operation.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// Bind a label and annotate the code stream with its name.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
69
#ifdef ASSERT
// Platform-dependent hook: on this platform instruction marks are always checked.
bool AbstractAssembler::pd_check_instruction_mark() { return true; }
#endif
73
// Maps each condition code to its logical negation, indexed by the
// numeric encoding of Assembler::Condition (noted in the trailing
// comments). Used to invert a branch condition.
static const Assembler::Condition reverse[] = {
    Assembler::noOverflow     /* overflow      = 0x0 */ ,
    Assembler::overflow       /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above          /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual     /* above         = 0x7 */ ,
    Assembler::positive       /* negative      = 0x8 */ ,
    Assembler::negative       /* positive      = 0x9 */ ,
    Assembler::noParity       /* parity        = 0xa */ ,
    Assembler::parity         /* noParity      = 0xb */ ,
    Assembler::greaterEqual   /* less          = 0xc */ ,
    Assembler::less           /* greaterEqual  = 0xd */ ,
    Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf, */

};
93
94
95 // Implementation of MacroAssembler
96
// Convert a reachable rvalue AddressLiteral into a pc-relative Address.
// The displacement is computed against the current pc; whether the
// operand is emitted as a displacement or absolute depends on the
// consuming instruction (jmp/call use displacements).
Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address(checked_cast<int32_t>(adr.target() - pc()), adr.target(), adr.reloc());

}
106
// Materialize an ArrayAddress as a plain Address: the base literal is
// loaded into rscratch (clobbering it), then combined with the array's
// index/scale. Note this emits a lea as a side effect.
Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
  AddressLiteral base = adr.base();
  lea(rscratch, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch, index._index, index._scale, index._disp);
  return array;
}
115
// Emit a call to a leaf VM runtime routine. Ensures the stack is
// 16-byte aligned at the call (inserting an 8-byte adjustment when it
// is not) and, on Windows, reserves the ABI-mandated register-argument
// save area. num_args is only used for the Win64 register-arg assert.
void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for it's register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp, frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  // Misaligned path: drop 8 bytes to reach 16-byte alignment, call, restore.
  subq(rsp, 8);
  call(RuntimeAddress(entry_point));
  addq(rsp, 8);
  jmp(E);

  // Already aligned: call directly.
  bind(L);
  call(RuntimeAddress(entry_point));

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif
}
144
// 64-bit compare of src1 against the value stored at src2. If src2 is
// not rip-reachable its address is materialized in rscratch first
// (clobbering it). src2 must be an rvalue; lvalues go through cmpptr.
void MacroAssembler::cmp64(Register src1, AddressLiteral src2, Register rscratch) {
  assert(!src2.is_lval(), "should use cmpptr");
  assert(rscratch != noreg || always_reachable(src2), "missing");

  if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch, src2);
    Assembler::cmpq(src1, Address(rscratch, 0));
  }
}
156
int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271. The function
  // returns the (pc) offset of the idivl instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor (may not be eax/edx)     -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case: dividend == min_long requires checking the
  // divisor, since min_long / -1 overflows idiv.
  cmp64(rax, ExternalAddress((address) &min_long), rdx /*rscratch*/);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case: sign-extend rax into rdx:rax, then divide.
  bind(normal_case);
  cdqq();
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}
193
194 void MacroAssembler::decrementq(Register reg, int value) {
195 if (value == min_jint) { subq(reg, value); return; }
196 if (value < 0) { incrementq(reg, -value); return; }
197 if (value == 0) { ; return; }
198 if (value == 1 && UseIncDec) { decq(reg) ; return; }
199 /* else */ { subq(reg, value) ; return; }
200 }
201
202 void MacroAssembler::decrementq(Address dst, int value) {
203 if (value == min_jint) { subq(dst, value); return; }
204 if (value < 0) { incrementq(dst, -value); return; }
205 if (value == 0) { ; return; }
206 if (value == 1 && UseIncDec) { decq(dst) ; return; }
207 /* else */ { subq(dst, value) ; return; }
208 }
209
// Increment the 64-bit value at an AddressLiteral destination. If dst
// is not rip-reachable its address is first loaded into rscratch
// (clobbering it).
void MacroAssembler::incrementq(AddressLiteral dst, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");

  if (reachable(dst)) {
    incrementq(as_Address(dst));
  } else {
    lea(rscratch, dst);
    incrementq(Address(rscratch, 0));
  }
}
220
221 void MacroAssembler::incrementq(Register reg, int value) {
222 if (value == min_jint) { addq(reg, value); return; }
223 if (value < 0) { decrementq(reg, -value); return; }
224 if (value == 0) { ; return; }
225 if (value == 1 && UseIncDec) { incq(reg) ; return; }
226 /* else */ { addq(reg, value) ; return; }
227 }
228
229 void MacroAssembler::incrementq(Address dst, int value) {
230 if (value == min_jint) { addq(dst, value); return; }
231 if (value < 0) { decrementq(dst, -value); return; }
232 if (value == 0) { ; return; }
233 if (value == 1 && UseIncDec) { incq(dst) ; return; }
234 /* else */ { addq(dst, value) ; return; }
235 }
236
// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
// Indirect jump through a jump-table entry: load the table base into
// rscratch (clobbering it), then jmp through base+index.
void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
  lea(rscratch, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch;
  jmp(dispatch);
}
246
// Dead on 64-bit: long compare never uses a register pair here. Kept
// only to satisfy the shared interface; guarded by ShouldNotReachHere.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}
251
// Load the (possibly relocated) address of src into dst as a full
// 64-bit immediate, so it can be patched later.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}
255
// Store the address of adr into memory at dst, staging it through
// rscratch (clobbering it).
void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
  lea(rscratch, adr);
  movptr(dst, rscratch);
}
260
// Emit a single LEAVE instruction (0xC9): mov rsp,rbp; pop rbp.
void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_int8((unsigned char)0xC9); // LEAVE
}
265
// Dead on 64-bit: long negate never uses a register pair here. Kept
// only to satisfy the shared interface; guarded by ShouldNotReachHere.
void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}
270
// Load an oop constant into a register as a patchable 64-bit immediate
// with oop relocation info.
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}

// Store an oop constant to memory, staging it through rscratch
// (clobbering it).
void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
  mov_literal64(rscratch, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch);
}

// Load a Metadata* constant into a register as a patchable 64-bit
// immediate with metadata relocation info.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
}

// Store a Metadata* constant to memory, staging it through rscratch
// (clobbering it).
void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
  mov_literal64(rscratch, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  movq(dst, rscratch);
}
288
// Load from / materialize an AddressLiteral into dst:
//  - lvalue: load the address itself as a patchable 64-bit immediate;
//  - rvalue, reachable: load the value via a pc-relative access;
//  - rvalue, unreachable: lea the address into dst, then load through it
//    (dst doubles as the scratch register).
void MacroAssembler::movptr(Register dst, AddressLiteral src) {
  if (src.is_lval()) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  } else {
    if (reachable(src)) {
      movq(dst, as_Address(src));
    } else {
      lea(dst, src);
      movq(dst, Address(dst, 0));
    }
  }
}
301
// Store src to an ArrayAddress slot; rscratch is clobbered by the
// address computation.
void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
  movq(as_Address(dst, rscratch), src);
}

// Load an ArrayAddress slot into dst; dst itself serves as the scratch
// register for the address computation.
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src, dst /*rscratch*/));
}
309
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
// Store an integral immediate to memory: directly if it fits in a
// sign-extended 32-bit immediate, otherwise staged through rscratch.
void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
  if (is_simm32(src)) {
    movptr(dst, checked_cast<int32_t>(src));
  } else {
    mov64(rscratch, src);
    movq(dst, rscratch);
  }
}
319
// Push an oop constant on the stack, staged through rscratch
// (clobbering it).
void MacroAssembler::pushoop(jobject obj, Register rscratch) {
  movoop(rscratch, obj);
  push(rscratch);
}

// Push a Metadata* constant on the stack, staged through rscratch
// (clobbering it).
void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
  mov_metadata(rscratch, obj);
  push(rscratch);
}
329
// Push an AddressLiteral: for an lvalue push the address itself, for an
// rvalue push the value stored at that address. rscratch is clobbered.
void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
  lea(rscratch, src);
  if (src.is_lval()) {
    push(rscratch);
  } else {
    pushq(Address(rscratch, 0));
  }
}
338
339 static void pass_arg0(MacroAssembler* masm, Register arg) {
340 if (c_rarg0 != arg ) {
341 masm->mov(c_rarg0, arg);
342 }
343 }
344
345 static void pass_arg1(MacroAssembler* masm, Register arg) {
346 if (c_rarg1 != arg ) {
347 masm->mov(c_rarg1, arg);
348 }
349 }
350
351 static void pass_arg2(MacroAssembler* masm, Register arg) {
352 if (c_rarg2 != arg ) {
353 masm->mov(c_rarg2, arg);
354 }
355 }
356
357 static void pass_arg3(MacroAssembler* masm, Register arg) {
358 if (c_rarg3 != arg ) {
359 masm->mov(c_rarg3, arg);
360 }
361 }
362
// Emit code that halts the VM with a fatal message. When
// ShowMessageBoxOnError is set, the registers are first saved on the
// stack and their location plus the faulting rip are passed to debug64.
void MacroAssembler::stop(const char* msg) {
  if (ShowMessageBoxOnError) {
    address rip = pc();
    pusha(); // get regs on stack
    lea(c_rarg1, InternalAddress(rip));
    movq(c_rarg2, rsp); // pass pointer to regs array
  }
  // Skip AOT caching C strings in scratch buffer.
  const char* str = (code_section()->scratch_emit()) ? msg : AOTCodeCache::add_C_string(msg);
  lea(c_rarg0, ExternalAddress((address) str));
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}
377
// Emit code that prints a warning message via the VM's warning()
// routine and continues execution. Saves and restores the full CPU
// state around the call; uses rbp to restore the original rsp.
void MacroAssembler::warn(const char* msg) {
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

#ifdef _WIN64
  // Windows always allocates space for its register args
  subq(rsp,  frame::arg_reg_save_area_bytes);
#endif
  lea(c_rarg0, ExternalAddress((address) msg));
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}
399
// Emit code that dumps the current register state and nearby memory by
// calling print_state64 with the current rip and a pointer to the
// pusha()-saved register block. Execution continues afterwards.
void MacroAssembler::print_state() {
  address rip = pc();
  pusha();            // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}
417
418 #ifndef PRODUCT
419 extern "C" void findpc(intptr_t x);
420 #endif
421
// Runtime target of stop(): optionally shows a message box offering to
// print the saved register state (regs points at the pusha() block,
// pc is the faulting rip), then terminates the VM with a fatal error.
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}
444
445 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
446 ttyLocker ttyl;
447 DebuggingContext debugging{};
448 tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
449 #ifndef PRODUCT
450 tty->cr();
451 findpc(pc);
452 tty->cr();
453 #endif
454 #define PRINT_REG(rax, value) \
455 { tty->print("%s = ", #rax); os::print_location(tty, value); }
456 PRINT_REG(rax, regs[15]);
457 PRINT_REG(rbx, regs[12]);
458 PRINT_REG(rcx, regs[14]);
459 PRINT_REG(rdx, regs[13]);
460 PRINT_REG(rdi, regs[8]);
461 PRINT_REG(rsi, regs[9]);
462 PRINT_REG(rbp, regs[10]);
463 // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
464 PRINT_REG(rsp, (intptr_t)(®s[16]));
465 PRINT_REG(r8 , regs[7]);
466 PRINT_REG(r9 , regs[6]);
467 PRINT_REG(r10, regs[5]);
468 PRINT_REG(r11, regs[4]);
469 PRINT_REG(r12, regs[3]);
470 PRINT_REG(r13, regs[2]);
471 PRINT_REG(r14, regs[1]);
472 PRINT_REG(r15, regs[0]);
473 #undef PRINT_REG
474 // Print some words near the top of the stack.
475 int64_t* rsp = ®s[16];
476 int64_t* dump_sp = rsp;
477 for (int col1 = 0; col1 < 8; col1++) {
478 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
479 os::print_location(tty, *dump_sp++);
480 }
481 for (int row = 0; row < 25; row++) {
482 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
483 for (int col = 0; col < 4; col++) {
484 tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
485 }
486 tty->cr();
487 }
488 // Print some instructions around pc:
489 Disassembler::decode((address)pc-64, (address)pc);
490 tty->print_cr("--------");
491 Disassembler::decode((address)pc, (address)pc+32);
492 }
493
494 // The java_calling_convention describes stack locations as ideal slots on
495 // a frame with no abi restrictions. Since we must observe abi restrictions
496 // (like the placement of the register window) the slots must be biased by
497 // the following value.
498 static int reg2offset_in(VMReg r) {
499 // Account for saved rbp and return address
500 // This should really be in_preserve_stack_slots
501 return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
502 }
503
504 static int reg2offset_out(VMReg r) {
505 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
506 }
507
508 // A long move
509 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
510
511 // The calling conventions assures us that each VMregpair is either
512 // all really one physical register or adjacent stack slots.
513
514 if (src.is_single_phys_reg() ) {
515 if (dst.is_single_phys_reg()) {
516 if (dst.first() != src.first()) {
517 mov(dst.first()->as_Register(), src.first()->as_Register());
518 }
519 } else {
520 assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
521 src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
522 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
523 }
524 } else if (dst.is_single_phys_reg()) {
525 assert(src.is_single_reg(), "not a stack pair");
526 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
527 } else {
528 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
529 movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
530 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
531 }
532 }
533
534 // A double move
535 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
536
537 // The calling conventions assures us that each VMregpair is either
538 // all really one physical register or adjacent stack slots.
539
540 if (src.is_single_phys_reg() ) {
541 if (dst.is_single_phys_reg()) {
542 // In theory these overlap but the ordering is such that this is likely a nop
543 if ( src.first() != dst.first()) {
544 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
545 }
546 } else {
547 assert(dst.is_single_reg(), "not a stack pair");
548 movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
549 }
550 } else if (dst.is_single_phys_reg()) {
551 assert(src.is_single_reg(), "not a stack pair");
552 movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
553 } else {
554 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
555 movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
556 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
557 }
558 }
559
560
561 // A float arg may have to do float reg int reg conversion
562 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
563 assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
564
565 // The calling conventions assures us that each VMregpair is either
566 // all really one physical register or adjacent stack slots.
567
568 if (src.first()->is_stack()) {
569 if (dst.first()->is_stack()) {
570 movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
571 movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
572 } else {
573 // stack to reg
574 assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
575 movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
576 }
577 } else if (dst.first()->is_stack()) {
578 // reg to stack
579 assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
580 movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
581 } else {
582 // reg to reg
583 // In theory these overlap but the ordering is such that this is likely a nop
584 if ( src.first() != dst.first()) {
585 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
586 }
587 }
588 }
589
590 // On 64 bit we will store integer like items to the stack as
591 // 64 bits items (x86_32/64 abi) even though java would only store
592 // 32bits for a parameter. On 32bit it will simply be 32 bits
593 // So this routine will do 32->32 on 32bit and 32->64 on 64bit
// Move a 32-bit integer argument, widening it to 64 bits when it lands
// in a stack slot or register (stack loads use movslq, i.e. sign
// extension).
void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack
      movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
      movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
    } else {
      // stack to reg
      movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    // Do we really have to sign extend???
    // __ movslq(src.first()->as_Register(), src.first()->as_Register());
    movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
  } else {
    // Do we really have to sign extend???
    // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
    if (dst.first() != src.first()) {
      movq(dst.first()->as_Register(), src.first()->as_Register());
    }
  }
}
617
// Move a pointer-sized argument between registers and/or stack slots.
// Unlike long_move, the stack-to-stack case stages through rax
// (clobbering it) rather than a caller-supplied temp.
void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack
      movq(rax, Address(rbp, reg2offset_in(src.first())));
      movq(Address(rsp, reg2offset_out(dst.first())), rax);
    } else {
      // stack to reg
      movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
  } else {
    // reg to reg (skip when already in place)
    if (dst.first() != src.first()) {
      movq(dst.first()->as_Register(), src.first()->as_Register());
    }
  }
}
637
638 // An oop arg. Must pass a handle not the oop itself
639 void MacroAssembler::object_move(OopMap* map,
640 int oop_handle_offset,
641 int framesize_in_slots,
642 VMRegPair src,
643 VMRegPair dst,
644 bool is_receiver,
645 int* receiver_offset) {
646
647 // must pass a handle. First figure out the location we use as a handle
648
649 Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
650
651 // See if oop is null if it is we need no handle
652
653 if (src.first()->is_stack()) {
654
655 // Oop is already on the stack as an argument
656 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
657 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
658 if (is_receiver) {
659 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
660 }
661
662 cmpptr(Address(rbp, reg2offset_in(src.first())), NULL_WORD);
663 lea(rHandle, Address(rbp, reg2offset_in(src.first())));
664 // conditionally move a null
665 cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
666 } else {
667
668 // Oop is in a register we must store it to the space we reserve
669 // on the stack for oop_handles and pass a handle if oop is non-null
670
671 const Register rOop = src.first()->as_Register();
672 int oop_slot;
673 if (rOop == j_rarg0)
674 oop_slot = 0;
675 else if (rOop == j_rarg1)
676 oop_slot = 1;
677 else if (rOop == j_rarg2)
678 oop_slot = 2;
679 else if (rOop == j_rarg3)
680 oop_slot = 3;
681 else if (rOop == j_rarg4)
682 oop_slot = 4;
683 else {
684 assert(rOop == j_rarg5, "wrong register");
685 oop_slot = 5;
686 }
687
688 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
689 int offset = oop_slot*VMRegImpl::stack_slot_size;
690
691 map->set_oop(VMRegImpl::stack2reg(oop_slot));
692 // Store oop in handle area, may be null
693 movptr(Address(rsp, offset), rOop);
694 if (is_receiver) {
695 *receiver_offset = offset;
696 }
697
698 cmpptr(rOop, NULL_WORD);
699 lea(rHandle, Address(rsp, offset));
700 // conditionally move a null from the handle area where it was just stored
701 cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
702 }
703
704 // If arg is on the stack then place it otherwise it is already in correct reg.
705 if (dst.first()->is_stack()) {
706 movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
707 }
708 }
709
// Pointer-sized add: on x86_64 these are simple aliases for the
// 64-bit addq forms.
void MacroAssembler::addptr(Register dst, int32_t imm32) {
  addq(dst, imm32);
}

void MacroAssembler::addptr(Register dst, Register src) {
  addq(dst, src);
}

void MacroAssembler::addptr(Address dst, Register src) {
  addq(dst, src);
}
721
// SSE adds with an AddressLiteral memory operand. When the literal is
// not rip-reachable, its address is materialized in rscratch
// (clobbering it) and the operand accessed indirectly.
void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::addsd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::addsd(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::addss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    addss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    addss(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::addpd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::addpd(dst, Address(rscratch, 0));
  }
}
754
755 // See 8273459. Function for ensuring 64-byte alignment, intended for stubs only.
756 // Stub code is generated once and never copied.
757 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
// See 8273459. Function for ensuring 64-byte alignment, intended for stubs only.
// Stub code is generated once and never copied.
// NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
void MacroAssembler::align64() {
  align(64, (uint)(uintptr_t)pc());
}

// 32-byte variant of align64(); same stub-only caveat applies.
void MacroAssembler::align32() {
  align(32, (uint)(uintptr_t)pc());
}

// Align the current code offset to the given modulus by emitting nops.
void MacroAssembler::align(uint modulus) {
  // 8273459: Ensure alignment is possible with current segment alignment
  assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
  align(modulus, offset());
}
771
772 void MacroAssembler::align(uint modulus, uint target) {
773 if (target % modulus != 0) {
774 nop(modulus - (target % modulus));
775 }
776 }
777
// Push/pop float and double XMM values via the stack. Floats use one
// word, doubles use two.
void MacroAssembler::push_f(XMMRegister r) {
  subptr(rsp, wordSize);
  movflt(Address(rsp, 0), r);
}

void MacroAssembler::pop_f(XMMRegister r) {
  movflt(r, Address(rsp, 0));
  addptr(rsp, wordSize);
}

void MacroAssembler::push_d(XMMRegister r) {
  subptr(rsp, 2 * wordSize);
  movdbl(Address(rsp, 0), r);
}

void MacroAssembler::pop_d(XMMRegister r) {
  movdbl(r, Address(rsp, 0));
  // NOTE(review): pop uses Interpreter::stackElementSize where push used
  // wordSize — presumably equal on this platform; confirm.
  addptr(rsp, 2 * Interpreter::stackElementSize);
}
797
// Push/pop using the APX push/pop variants (pushp/popp) when the CPU
// supports APX, otherwise the plain encodings.
void MacroAssembler::push_ppx(Register src) {
  if (VM_Version::supports_apx_f()) {
    pushp(src);
  } else {
    Assembler::push(src);
  }
}

void MacroAssembler::pop_ppx(Register dst) {
  if (VM_Version::supports_apx_f()) {
    popp(dst);
  } else {
    Assembler::pop(dst);
  }
}
813
void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  assert(rscratch != noreg || always_reachable(src), "missing");

  // High XMM registers (>= 16) are only encodable with EVEX; without
  // AVX512DQ+VL the legacy andpd can't target them, so fall back to a
  // 512-bit vpand.
  if (UseAVX > 2 &&
      (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
      (dst->encoding() >= 16)) {
    vpand(dst, dst, src, AVX_512bit, rscratch);
  } else if (reachable(src)) {
    Assembler::andpd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::andpd(dst, Address(rscratch, 0));
  }
}
830
void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register rscratch) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  assert(rscratch != noreg || always_reachable(src), "missing");

  // When the literal is not rip-reachable, go through rscratch.
  if (reachable(src)) {
    Assembler::andps(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::andps(dst, Address(rscratch, 0));
  }
}
843
// Pointer-sized AND: alias for the 64-bit andq form.
void MacroAssembler::andptr(Register dst, int32_t imm32) {
  andq(dst, imm32);
}

// 64-bit AND with an AddressLiteral operand; goes through rscratch
// (clobbering it) when the literal is not rip-reachable.
void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    andq(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    andq(dst, Address(rscratch, 0));
  }
}
858
// Atomically increment a 32-bit counter in memory (lock-prefixed inc).
void MacroAssembler::atomic_incl(Address counter_addr) {
  lock();
  incrementl(counter_addr);
}

// AddressLiteral variant; goes through rscratch (clobbering it) when
// the literal is not rip-reachable.
void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register rscratch) {
  assert(rscratch != noreg || always_reachable(counter_addr), "missing");

  if (reachable(counter_addr)) {
    atomic_incl(as_Address(counter_addr));
  } else {
    lea(rscratch, counter_addr);
    atomic_incl(Address(rscratch, 0));
  }
}

// Atomically increment a 64-bit counter in memory (lock-prefixed inc).
void MacroAssembler::atomic_incq(Address counter_addr) {
  lock();
  incrementq(counter_addr);
}

// AddressLiteral variant; goes through rscratch (clobbering it) when
// the literal is not rip-reachable.
void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register rscratch) {
  assert(rscratch != noreg || always_reachable(counter_addr), "missing");

  if (reachable(counter_addr)) {
    atomic_incq(as_Address(counter_addr));
  } else {
    lea(rscratch, counter_addr);
    atomic_incq(Address(rscratch, 0));
  }
}
890
// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages. This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-(int)os::vm_page_size())), size );
  subptr(tmp, (int)os::vm_page_size());
  subl(size, (int)os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down including all pages in the shadow zone.
  for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()); i++) {
    // this could be any sized move but this is can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*(int)os::vm_page_size())), size );
  }
}
917
// Emit the reserved-stack-zone check: if rsp has dropped to (or past)
// the thread's reserved_stack_activation watermark, call into the
// runtime to enable the reserved zone and throw the delayed
// StackOverflowError; otherwise fall through.
void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  cmpptr(rsp, Address(r15_thread, JavaThread::reserved_stack_activation_offset()));
  jcc(Assembler::below, no_reserved_zone_enabling);

  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), r15_thread);
  jump(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}
931
// Normalize a C-style bool: x = (x == 0 ? 0 : 1).
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  andl(x, 0xFF);
  setb(Assembler::notZero, x);
}
940
// Call to a code-local label. Wouldn't need if AddressLiteral version had new name.
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
  Assembler::call(L, rtype);
}
945
// Indirect call through a register.
void MacroAssembler::call(Register entry) {
  Assembler::call(entry);
}
949
// Call to an AddressLiteral target: direct (rel32) when reachable, otherwise
// the target address is materialized in rscratch and called indirectly.
void MacroAssembler::call(AddressLiteral entry, Register rscratch) {
  assert(rscratch != noreg || always_reachable(entry), "missing");

  if (reachable(entry)) {
    Assembler::call_literal(entry.target(), entry.rspec());
  } else {
    lea(rscratch, entry);
    Assembler::call(rscratch);
  }
}
960
// Emit an inline-cache call: loads the cached-value placeholder
// (non_oop_word) into rax and calls entry with a virtual-call relocation.
void MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // Needs full 64-bit immediate for later patching.
  Assembler::mov64(rax, (int64_t)Universe::non_oop_word());
  call(AddressLiteral(entry, rh));
}
967
// Byte size of the sequence emitted by ic_check(); larger with compact object
// headers because of the extra narrow-klass load (constants must track the
// instruction encodings emitted below — verify when ic_check changes).
int MacroAssembler::ic_check_size() {
  return UseCompactObjectHeaders ? 17 : 14;
}
971
// Emit the inline-cache receiver-klass check (the UEP). Compares the
// receiver's (narrow) klass against the speculated klass in the CompiledICData
// held in rax; on mismatch, jumps to the IC-miss stub. Returns the offset of
// the unverified entry point.
int MacroAssembler::ic_check(int end_alignment) {
  Register receiver = j_rarg0;
  Register data = rax;
  Register temp = rscratch1;

  // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
  // before the inline cache check, so we don't have to execute any nop instructions when dispatching
  // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after
  align(end_alignment, offset() + ic_check_size());

  int uep_offset = offset();

  if (UseCompactObjectHeaders) {
    load_narrow_klass_compact(temp, receiver);
    cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
  } else {
    movl(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
    cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
  }

  // if inline cache check fails, then jump to runtime routine
  jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  assert((offset() % end_alignment) == 0, "Misaligned verified entry point (%d, %d, %d)", uep_offset, offset(), end_alignment);

  return uep_offset;
}
999
// Emit the to-be-patched static call stub: a null Method* load into rbx
// followed by a self-referential jump, both recognized as unresolved.
void MacroAssembler::emit_static_call_stub() {
  // Static stub relocation also tags the Method* in the code-stream.
  mov_metadata(rbx, (Metadata*) nullptr);  // Method is zapped till fixup time.
  // This is recognized as unresolved by relocs/nativeinst/ic code.
  jump(RuntimeAddress(pc()));
}
1006
1007 // Implementation of call_VM versions
1008
// call_VM with no Java arguments. The call(C)/jmp(E) trampoline pushes a
// return address onto the stack, which call_VM_helper uses to derive
// last_Java_sp/pc (see call_VM_helper).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
  ret(0);

  bind(E);
}
1022
// call_VM with one Java argument (see the 0-argument variant for the
// call/jmp trampoline explanation).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
  ret(0);

  bind(E);
}
1038
// call_VM with two Java arguments. Arguments are marshaled last-to-first so
// an earlier arg cannot clobber a later arg's target register (asserted).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  assert_different_registers(arg_1, c_rarg2);

  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
  ret(0);

  bind(E);
}
1059
// call_VM with three Java arguments; marshaled last-to-first to avoid
// clobbering argument registers still to be read (asserted).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
  ret(0);

  bind(E);
}
1082
// call_VM variant with an explicit last_java_sp; forwards to call_VM_base.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
1090
// Explicit-last_java_sp call_VM with one Java argument.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
1099
// Explicit-last_java_sp call_VM with two Java arguments; marshaled
// last-to-first (clobber-freedom asserted).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
1112
// Explicit-last_java_sp call_VM with three Java arguments; marshaled
// last-to-first (clobber-freedom asserted).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
1127
// Like call_VM but explicitly binds to MacroAssembler::call_VM_base,
// bypassing any subclass override of call_VM_base.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   int number_of_arguments,
                                   bool check_exceptions) {
  MacroAssembler::call_VM_base(oop_result, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
1135
// super_call_VM with one Java argument.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   bool check_exceptions) {
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
1144
// super_call_VM with two Java arguments; marshaled last-to-first
// (clobber-freedom asserted).
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   bool check_exceptions) {

  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
1157
// super_call_VM with three Java arguments; marshaled last-to-first
// (clobber-freedom asserted).
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   Register arg_3,
                                   bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
1172
// Common tail of all call_VM variants: establishes the last Java frame,
// calls the VM entry with r15_thread as the implicit first C argument,
// resets the frame, optionally checks/forwards pending exceptions, and
// fetches an oop result from the thread if requested.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  int number_of_arguments,
                                  bool check_exceptions) {
  Register java_thread = r15_thread;

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // r12 is the heapbase.
  if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, r15_thread);

  // set last Java frame before call
  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(last_java_sp, rbp, nullptr, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

#ifdef ASSERT
  // Check that thread register is not clobbered.
  guarantee(java_thread != rax, "change this code");
  push(rax);
  { Label L;
    get_thread_slow(rax);
    cmpptr(java_thread, rax);
    jcc(Assembler::equal, L);
    STOP("MacroAssembler::call_VM_base: java_thread not callee saved?");
    bind(L);
  }
  pop(rax);
#endif

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe();
  check_and_handle_earlyret();

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach

    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result_oop(oop_result);
  }
}
1248
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  // Calculating the value for last_Java_sp is somewhat subtle.
  // call_VM does an intermediate call which places a return address on
  // the stack just under the stack pointer as the user finished with it.
  // This allows us to retrieve last_Java_pc from last_Java_sp[-1].

  // We've pushed one address, correct last_Java_sp
  lea(rax, Address(rsp, wordSize));

  call_VM_base(oop_result, rax, entry_point, number_of_arguments, check_exceptions);
}
1260
// Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
void MacroAssembler::call_VM_leaf0(address entry_point) {
  MacroAssembler::call_VM_leaf_base(entry_point, 0);
}
1265
// Leaf call into the VM (no Java frame bookkeeping, no exception check).
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}
1269
// Leaf call with one argument.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 1);
}
1274
// Leaf call with two arguments; marshaled last-to-first so moving arg_1 into
// c_rarg1 cannot clobber arg_0 (asserted).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert_different_registers(arg_0, c_rarg1);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 2);
}
1282
// Leaf call with three arguments; marshaled last-to-first
// (clobber-freedom asserted).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2);
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 3);
}
1291
1292 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1293 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
1294 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1295 assert_different_registers(arg_2, c_rarg3);
1296 pass_arg3(this, arg_3);
1297 pass_arg2(this, arg_2);
1298 pass_arg1(this, arg_1);
1299 pass_arg0(this, arg_0);
1300 call_VM_leaf(entry_point, 3);
1301 }
1302
// Leaf call binding directly to MacroAssembler::call_VM_leaf_base
// (bypasses subclass overrides); one argument.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
1307
// super_call_VM_leaf with two arguments; marshaled last-to-first (asserted).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  assert_different_registers(arg_0, c_rarg1);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
1314
// super_call_VM_leaf with three arguments; marshaled last-to-first (asserted).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2);
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}
1323
// super_call_VM_leaf with four arguments; marshaled last-to-first (asserted).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
1334
// Fetch the oop the VM call left in the thread-local vm_result_oop slot,
// clear the slot, and verify the oop.
void MacroAssembler::get_vm_result_oop(Register oop_result) {
  movptr(oop_result, Address(r15_thread, JavaThread::vm_result_oop_offset()));
  movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
  verify_oop_msg(oop_result, "broken oop in call_VM_base");
}
1340
// Fetch the Metadata* the VM call left in the thread-local
// vm_result_metadata slot and clear the slot.
void MacroAssembler::get_vm_result_metadata(Register metadata_result) {
  movptr(metadata_result, Address(r15_thread, JavaThread::vm_result_metadata_offset()));
  movptr(Address(r15_thread, JavaThread::vm_result_metadata_offset()), NULL_WORD);
}
1345
// Intentionally empty here; NOTE(review): presumably overridden by the
// interpreter's assembler subclass — confirm.
void MacroAssembler::check_and_handle_earlyret() {
}
1348
// Intentionally empty here; NOTE(review): presumably overridden by the
// interpreter's assembler subclass — confirm.
void MacroAssembler::check_and_handle_popframe() {
}
1351
// 32-bit compare of memory at an AddressLiteral against an immediate;
// uses rscratch to form the address when the literal is not reachable.
void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src1), "missing");

  if (reachable(src1)) {
    cmpl(as_Address(src1), imm);
  } else {
    lea(rscratch, src1);
    cmpl(Address(rscratch, 0), imm);
  }
}
1362
// 32-bit compare of a register against memory at an AddressLiteral;
// uses rscratch when the literal is not reachable.
void MacroAssembler::cmp32(Register src1, AddressLiteral src2, Register rscratch) {
  assert(!src2.is_lval(), "use cmpptr");
  assert(rscratch != noreg || always_reachable(src2), "missing");

  if (reachable(src2)) {
    cmpl(src1, as_Address(src2));
  } else {
    lea(rscratch, src2);
    cmpl(src1, Address(rscratch, 0));
  }
}
1374
// 32-bit register vs. immediate compare.
void MacroAssembler::cmp32(Register src1, int32_t imm) {
  Assembler::cmpl(src1, imm);
}
1378
// 32-bit register vs. memory compare.
void MacroAssembler::cmp32(Register src1, Address src2) {
  Assembler::cmpl(src1, src2);
}
1382
// Double compare producing -1/0/+1 in dst (Java dcmpl/dcmpg semantics).
// ucomisd sets the parity flag for unordered (NaN) operands; the
// unordered_is_less flag decides whether NaN compares as -1 or +1.
void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomisd(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);
    jcc(Assembler::below , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    increment(dst);
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);
    jcc(Assembler::above , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    decrementl(dst);
  }
  bind(L);
}
1404
// Float compare producing -1/0/+1 in dst (Java fcmpl/fcmpg semantics);
// same structure as cmpsd2int, using ucomiss (parity flag = unordered/NaN).
void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomiss(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);
    jcc(Assembler::below , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    increment(dst);
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);
    jcc(Assembler::above , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    decrementl(dst);
  }
  bind(L);
}
1426
1427
// 8-bit compare of memory at an AddressLiteral against an immediate;
// uses rscratch when the literal is not reachable.
void MacroAssembler::cmp8(AddressLiteral src1, int imm, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src1), "missing");

  if (reachable(src1)) {
    cmpb(as_Address(src1), imm);
  } else {
    lea(rscratch, src1);
    cmpb(Address(rscratch, 0), imm);
  }
}
1438
// Pointer-width compare of a register against an AddressLiteral. An lval
// literal is compared by value (its address is loaded into rscratch first);
// otherwise the memory it designates is compared, via rscratch if unreachable.
void MacroAssembler::cmpptr(Register src1, AddressLiteral src2, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src2), "missing");

  if (src2.is_lval()) {
    movptr(rscratch, src2);
    Assembler::cmpq(src1, rscratch);
  } else if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch, src2);
    Assembler::cmpq(src1, Address(rscratch, 0));
  }
}
1452
// Compare memory at src1 against the literal address of src2 (must be an
// lval); the literal is materialized in rscratch first.
void MacroAssembler::cmpptr(Address src1, AddressLiteral src2, Register rscratch) {
  assert(src2.is_lval(), "not a mem-mem compare");
  // moves src2's literal address
  movptr(rscratch, src2);
  Assembler::cmpq(src1, rscratch);
}
1459
// Compare two oop registers (plain pointer compare on this platform).
void MacroAssembler::cmpoop(Register src1, Register src2) {
  cmpptr(src1, src2);
}
1463
// Compare an oop register against an oop in memory.
void MacroAssembler::cmpoop(Register src1, Address src2) {
  cmpptr(src1, src2);
}
1467
// Compare an oop register against a jobject constant; the constant is
// materialized in rscratch first.
void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) {
  movoop(rscratch, src2);
  cmpptr(src1, rscratch);
}
1472
// LOCK-prefixed pointer-width cmpxchg of reg against memory at an
// AddressLiteral; uses rscratch when the literal is not reachable.
void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) {
  assert(rscratch != noreg || always_reachable(adr), "missing");

  if (reachable(adr)) {
    lock();
    cmpxchgptr(reg, as_Address(adr));
  } else {
    lea(rscratch, adr);
    lock();
    cmpxchgptr(reg, Address(rscratch, 0));
  }
}
1485
// Pointer-width compare-and-exchange (64-bit on this platform).
void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
  cmpxchgq(reg, adr);
}
1489
// comisd with an AddressLiteral operand; uses rscratch when unreachable.
void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::comisd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::comisd(dst, Address(rscratch, 0));
  }
}
1500
// comiss with an AddressLiteral operand; uses rscratch when unreachable.
void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::comiss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::comiss(dst, Address(rscratch, 0));
  }
}
1511
1512
// Conditionally increment a 32-bit counter: if cond holds, bump the counter.
// Flags are preserved around the increment (pushf/popf) since the increment
// itself would clobber them.
void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch) {
  assert(rscratch != noreg || always_reachable(counter_addr), "missing");

  Condition negated_cond = negate_condition(cond);
  Label L;
  jcc(negated_cond, L);
  pushf(); // Preserve flags
  atomic_incl(counter_addr, rscratch);
  popf();
  bind(L);
}
1524
int MacroAssembler::corrected_idivl(Register reg) {
  // Full implementation of Java idiv and irem; checks for
  // special case as described in JVM spec., p.243 & p.271.
  // The function returns the (pc) offset of the idivl
  // instruction - may be needed for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax,: dividend                         min_int
  //         reg: divisor   (may not be rax,/rdx)   -1
  //
  // output: rax,: quotient  (= rax, idiv reg)       min_int
  //         rdx: remainder (= rax, irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
  const int min_int = 0x80000000;
  Label normal_case, special_case;

  // check for special case: min_int / -1 would overflow idivl (#DE),
  // so it is handled without dividing.
  cmpl(rax, min_int);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
  cmpl(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case: sign-extend rax into rdx:rax, then divide.
  bind(normal_case);
  cdql();
  int idivl_offset = offset();
  idivl(reg);

  // normal and special case exit
  bind(special_case);

  return idivl_offset;
}
1560
1561
1562
1563 void MacroAssembler::decrementl(Register reg, int value) {
1564 if (value == min_jint) {subl(reg, value) ; return; }
1565 if (value < 0) { incrementl(reg, -value); return; }
1566 if (value == 0) { ; return; }
1567 if (value == 1 && UseIncDec) { decl(reg) ; return; }
1568 /* else */ { subl(reg, value) ; return; }
1569 }
1570
1571 void MacroAssembler::decrementl(Address dst, int value) {
1572 if (value == min_jint) {subl(dst, value) ; return; }
1573 if (value < 0) { incrementl(dst, -value); return; }
1574 if (value == 0) { ; return; }
1575 if (value == 1 && UseIncDec) { decl(dst) ; return; }
1576 /* else */ { subl(dst, value) ; return; }
1577 }
1578
// Signed division by a power of two via arithmetic shift. For negative
// dividends, (2^shift - 1) is added first so the shift rounds toward zero
// like Java integer division.
void MacroAssembler::division_with_shift (Register reg, int shift_value) {
  assert (shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl (reg, reg);
  jcc (Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1 ;

  if (offset == 1) {
    incrementl(reg);
  } else {
    addl(reg, offset);
  }

  bind (_is_positive);
  sarl(reg, shift_value);
}
1595
// divsd with an AddressLiteral operand; uses rscratch when unreachable.
void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::divsd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::divsd(dst, Address(rscratch, 0));
  }
}
1606
// divss with an AddressLiteral operand; uses rscratch when unreachable.
void MacroAssembler::divss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::divss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::divss(dst, Address(rscratch, 0));
  }
}
1617
// Standard frame prologue: save caller's rbp and establish a new frame pointer.
void MacroAssembler::enter() {
  push(rbp);
  mov(rbp, rsp);
}
1622
// Emit the 8-byte post-call NOP (0F 1F 84 00 + 4-byte imm) used by
// Continuations to mark/patch call return sites; relocated so it can be
// found later. Skipped entirely when Continuations are disabled.
void MacroAssembler::post_call_nop() {
  if (!Continuations::enabled()) {
    return;
  }
  InstructionMark im(this);
  relocate(post_call_nop_Relocation::spec());
  InlineSkippedInstructionsCounter skipCounter(this);
  emit_int8((uint8_t)0x0f);
  emit_int8((uint8_t)0x1f);
  emit_int8((uint8_t)0x84);
  emit_int8((uint8_t)0x00);
  emit_int32(0x00);
}
1636
// mulpd with an AddressLiteral operand; uses rscratch when unreachable.
void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");
  if (reachable(src)) {
    Assembler::mulpd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::mulpd(dst, Address(rscratch, 0));
  }
}
1646
// dst = c = a * b + c
// Scalar double fused multiply-add. vfmadd231sd is destructive on c, so the
// result is copied to dst when they differ.
void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
  Assembler::vfmadd231sd(c, a, b);
  if (dst != c) {
    movdbl(dst, c);
  }
}
1654
// dst = c = a * b + c
// Scalar float fused multiply-add; destructive on c, copied to dst if needed.
void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
  Assembler::vfmadd231ss(c, a, b);
  if (dst != c) {
    movflt(dst, c);
  }
}
1662
// dst = c = a * b + c
// Packed double fused multiply-add; destructive on c, copied to dst if needed.
void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231pd(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}
1670
// dst = c = a * b + c
// Packed float fused multiply-add; destructive on c, copied to dst if needed.
void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231ps(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}
1678
// dst = c = a * b + c
// Packed double fused multiply-add with a memory operand for b.
void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231pd(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}
1686
// dst = c = a * b + c
// Packed float fused multiply-add with a memory operand for b.
void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231ps(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}
1694
// 32-bit increment of memory at an AddressLiteral; uses rscratch when the
// literal is not reachable.
void MacroAssembler::incrementl(AddressLiteral dst, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");

  if (reachable(dst)) {
    incrementl(as_Address(dst));
  } else {
    lea(rscratch, dst);
    incrementl(Address(rscratch, 0));
  }
}
1705
// 32-bit increment of an ArrayAddress element.
void MacroAssembler::incrementl(ArrayAddress dst, Register rscratch) {
  incrementl(as_Address(dst, rscratch));
}
1709
1710 void MacroAssembler::incrementl(Register reg, int value) {
1711 if (value == min_jint) {addl(reg, value) ; return; }
1712 if (value < 0) { decrementl(reg, -value); return; }
1713 if (value == 0) { ; return; }
1714 if (value == 1 && UseIncDec) { incl(reg) ; return; }
1715 /* else */ { addl(reg, value) ; return; }
1716 }
1717
1718 void MacroAssembler::incrementl(Address dst, int value) {
1719 if (value == min_jint) {addl(dst, value) ; return; }
1720 if (value < 0) { decrementl(dst, -value); return; }
1721 if (value == 0) { ; return; }
1722 if (value == 1 && UseIncDec) { incl(dst) ; return; }
1723 /* else */ { addl(dst, value) ; return; }
1724 }
1725
// Unconditional jump to an AddressLiteral target: direct (rel32) when
// reachable, otherwise indirect via rscratch.
void MacroAssembler::jump(AddressLiteral dst, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");
  assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump");
  if (reachable(dst)) {
    jmp_literal(dst.target(), dst.rspec());
  } else {
    lea(rscratch, dst);
    jmp(rscratch);
  }
}
1736
// Conditional jump to an AddressLiteral target. When reachable, the Jcc is
// hand-encoded (short 8-bit form only for non-relocated targets in range,
// else the 6-byte 32-bit form). When unreachable, the condition is reversed
// to skip around an indirect jmp through rscratch.
void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");
  assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump_cc");
  if (reachable(dst)) {
    InstructionMark im(this);
    relocate(dst.reloc());
    const int short_size = 2;
    const int long_size = 6;
    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_int8(0x70 | cc);
      emit_int8((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      emit_int8(0x0F);
      emit_int8((unsigned char)(0x80 | cc));
      emit_int32(offs - long_size);
    }
  } else {
#ifdef ASSERT
    warning("reversing conditional branch");
#endif /* ASSERT */
    Label skip;
    jccb(reverse[cc], skip);
    lea(rscratch, dst);
    Assembler::jmp(rscratch);
    bind(skip);
  }
}
1767
// Store the current MXCSR to mxcsr_save and compare it (via tmp) against the
// standard MXCSR value; leaves the result in the flags for the caller.
void MacroAssembler::cmp32_mxcsr_std(Address mxcsr_save, Register tmp, Register rscratch) {
  ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
  assert(rscratch != noreg || always_reachable(mxcsr_std), "missing");

  stmxcsr(mxcsr_save);
  movl(tmp, mxcsr_save);
  if (EnableX86ECoreOpts) {
    // The mxcsr_std has status bits set for performance on ECore
    orl(tmp, 0x003f);
  } else {
    // Mask out status bits (only check control and mask bits)
    andl(tmp, 0xFFC0);
  }
  cmp32(tmp, mxcsr_std, rscratch);
}
1783
// ldmxcsr with an AddressLiteral operand; uses rscratch when unreachable.
void MacroAssembler::ldmxcsr(AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::ldmxcsr(as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::ldmxcsr(Address(rscratch, 0));
  }
}
1794
// Sign-extending byte load; returns the code offset of the load instruction
// (useful for implicit null-check bookkeeping).
int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off = offset();
  movsbl(dst, src); // movsxb
  return off;
}
1800
// Note: load_signed_short used to be called load_signed_word.
// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
// The term "word" in HotSpot means a 32- or 64-bit machine word.
// Returns the code offset of the load instruction.
int MacroAssembler::load_signed_short(Register dst, Address src) {
  // This is dubious to me since it seems safe to do a signed 16 => 64 bit
  // version but this is what 64bit has always done. This seems to imply
  // that users are only using 32bits worth.
  int off = offset();
  movswl(dst, src); // movsxw
  return off;
}
1813
// Zero-extending byte load; returns the code offset of the load instruction.
int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off = offset();
  movzbl(dst, src); // movzxb
  return off;
}
1821
// Note: load_unsigned_short used to be called load_unsigned_word.
// Zero-extending 16-bit load; returns the code offset of the load instruction.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off = offset();
  movzwl(dst, src); // movzxw
  return off;
}
1830
1831 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1832 switch (size_in_bytes) {
1833 case 8: movq(dst, src); break;
1834 case 4: movl(dst, src); break;
1835 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1836 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1837 default: ShouldNotReachHere();
1838 }
1839 }
1840
1841 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1842 switch (size_in_bytes) {
1843 case 8: movq(dst, src); break;
1844 case 4: movl(dst, src); break;
1845 case 2: movw(dst, src); break;
1846 case 1: movb(dst, src); break;
1847 default: ShouldNotReachHere();
1848 }
1849 }
1850
// 32-bit store of src to memory at an AddressLiteral; uses rscratch when the
// literal is not reachable.
void MacroAssembler::mov32(AddressLiteral dst, Register src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");

  if (reachable(dst)) {
    movl(as_Address(dst), src);
  } else {
    lea(rscratch, dst);
    movl(Address(rscratch, 0), src);
  }
}
1861
// 32-bit load from memory at an AddressLiteral. In the unreachable case dst
// itself serves as the scratch register for the address.
void MacroAssembler::mov32(Register dst, AddressLiteral src) {
  if (reachable(src)) {
    movl(dst, as_Address(src));
  } else {
    lea(dst, src);
    movl(dst, Address(dst, 0));
  }
}
1870
1871 // C++ bool manipulation
1872
// Load a C++ bool from memory, sized by the platform's sizeof(bool).
void MacroAssembler::movbool(Register dst, Address src) {
  if(sizeof(bool) == 1)
    movb(dst, src);
  else if(sizeof(bool) == 2)
    movw(dst, src);
  else if(sizeof(bool) == 4)
    movl(dst, src);
  else
    // unsupported
    ShouldNotReachHere();
}
1884
// Store a bool constant to memory, sized by the platform's sizeof(bool).
void MacroAssembler::movbool(Address dst, bool boolconst) {
  if(sizeof(bool) == 1)
    movb(dst, (int) boolconst);
  else if(sizeof(bool) == 2)
    movw(dst, (int) boolconst);
  else if(sizeof(bool) == 4)
    movl(dst, (int) boolconst);
  else
    // unsupported
    ShouldNotReachHere();
}
1896
// Store a bool from a register to memory, sized by the platform's sizeof(bool).
void MacroAssembler::movbool(Address dst, Register src) {
  if(sizeof(bool) == 1)
    movb(dst, src);
  else if(sizeof(bool) == 2)
    movw(dst, src);
  else if(sizeof(bool) == 4)
    movl(dst, src);
  else
    // unsupported
    ShouldNotReachHere();
}
1908
// Load a 32-bit value from AddressLiteral 'src' into XMM register 'dst'.
// Direct RIP-relative access when reachable; otherwise go through rscratch.
void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    movdl(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    movdl(dst, Address(rscratch, 0));
  }
}

// Load a 64-bit value from AddressLiteral 'src' into XMM register 'dst'.
void MacroAssembler::movq(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    movq(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    movq(dst, Address(rscratch, 0));
  }
}

// Load a double from AddressLiteral 'src' into 'dst'. movsd clears the
// upper half of the register; movlpd leaves it unchanged - the flag selects
// whichever is faster on the current CPU.
void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    if (UseXmmLoadAndClearUpper) {
      movsd (dst, as_Address(src));
    } else {
      movlpd(dst, as_Address(src));
    }
  } else {
    lea(rscratch, src);
    if (UseXmmLoadAndClearUpper) {
      movsd (dst, Address(rscratch, 0));
    } else {
      movlpd(dst, Address(rscratch, 0));
    }
  }
}

// Load a float from AddressLiteral 'src' into 'dst'.
void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    movss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    movss(dst, Address(rscratch, 0));
  }
}
1960
// Copy a scalar 16-bit (half-float) value from XMM 'src' to XMM 'dst'.
// With AVX10.2 a direct XMM-to-XMM evmovw is available; otherwise the value
// is bounced through the GPR 'rscratch' using the XMM<->GPR evmovw forms.
// NOTE(review): exact evmovw operand semantics assumed from usage - confirm
// against the Assembler definition.
void MacroAssembler::movhlf(XMMRegister dst, XMMRegister src, Register rscratch) {
  if (VM_Version::supports_avx10_2()) {
    evmovw(dst, src);
  } else {
    assert(rscratch != noreg, "missing");
    evmovw(rscratch, src);
    evmovw(dst, rscratch);
  }
}
1970
// Load the 64-bit immediate 'imm64' into 'dst', picking the shortest
// encoding: a 32-bit movl (which zero-extends) for unsigned-32-bit values,
// a sign-extended-imm32 movq for signed-32-bit values, and the full
// 10-byte mov64 only when neither fits.
void MacroAssembler::mov64(Register dst, int64_t imm64) {
  if (is_uimm32(imm64)) {
    movl(dst, checked_cast<uint32_t>(imm64));
  } else if (is_simm32(imm64)) {
    movq(dst, checked_cast<int32_t>(imm64));
  } else {
    Assembler::mov64(dst, imm64);
  }
}

// Relocated form: always emits the full 64-bit immediate so the relocation
// has a fixed-size field to patch - no short-encoding shortcuts here.
void MacroAssembler::mov64(Register dst, int64_t imm64, relocInfo::relocType rtype, int format) {
  Assembler::mov64(dst, imm64, rtype, format);
}
1984
// Pointer-width moves: on x86-64 a pointer is 64 bits, so these delegate
// to the q-suffixed forms.
void MacroAssembler::movptr(Register dst, Register src) {
  movq(dst, src);
}

void MacroAssembler::movptr(Register dst, Address src) {
  movq(dst, src);
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Register dst, intptr_t src) {
  mov64(dst, src);
}

void MacroAssembler::movptr(Address dst, Register src) {
  movq(dst, src);
}

// Store a 32-bit immediate sign-extended to 64 bits at 'dst' (movslq with
// an immediate stores the sign-extension of 'src').
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}
2005
// 128-bit unaligned moves. The plain (non-EVEX) movdqu encoding can only
// name XMM0-15; XMM16+ requires AVX-512VL - hence the asserts.
void MacroAssembler::movdqu(Address dst, XMMRegister src) {
  assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}

void MacroAssembler::movdqu(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}

void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}

// AddressLiteral form: direct when RIP-reachable, else via rscratch.
void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    movdqu(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    movdqu(dst, Address(rscratch, 0));
  }
}
2031
// 256-bit unaligned moves. As with movdqu, the VEX encoding reaches only
// XMM0-15; XMM16+ needs AVX-512VL.
void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
  assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}

void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}

void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}

// AddressLiteral form: direct when RIP-reachable, else via rscratch.
void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    vmovdqu(dst, as_Address(src));
  }
  else {
    lea(rscratch, src);
    vmovdqu(dst, Address(rscratch, 0));
  }
}

// Width-dispatching forms: pick the 512-bit EVEX, 256-bit VEX, or 128-bit
// instruction according to 'vector_len'.
void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (vector_len == AVX_512bit) {
    evmovdquq(dst, src, AVX_512bit, rscratch);
  } else if (vector_len == AVX_256bit) {
    vmovdqu(dst, src, rscratch);
  } else {
    movdqu(dst, src, rscratch);
  }
}

void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src, int vector_len) {
  if (vector_len == AVX_512bit) {
    evmovdquq(dst, src, AVX_512bit);
  } else if (vector_len == AVX_256bit) {
    vmovdqu(dst, src);
  } else {
    movdqu(dst, src);
  }
}

void MacroAssembler::vmovdqu(Address dst, XMMRegister src, int vector_len) {
  if (vector_len == AVX_512bit) {
    evmovdquq(dst, src, AVX_512bit);
  } else if (vector_len == AVX_256bit) {
    vmovdqu(dst, src);
  } else {
    movdqu(dst, src);
  }
}

void MacroAssembler::vmovdqu(XMMRegister dst, Address src, int vector_len) {
  if (vector_len == AVX_512bit) {
    evmovdquq(dst, src, AVX_512bit);
  } else if (vector_len == AVX_256bit) {
    vmovdqu(dst, src);
  } else {
    movdqu(dst, src);
  }
}
2100
// 256-bit aligned move from an AddressLiteral: direct when RIP-reachable,
// else via rscratch. Aligned form - the target must be 32-byte aligned.
void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    vmovdqa(dst, as_Address(src));
  }
  else {
    lea(rscratch, src);
    vmovdqa(dst, Address(rscratch, 0));
  }
}

// Width-dispatching aligned load: 512-bit EVEX, 256-bit VEX, or 128-bit
// movdqa according to 'vector_len'.
void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (vector_len == AVX_512bit) {
    evmovdqaq(dst, src, AVX_512bit, rscratch);
  } else if (vector_len == AVX_256bit) {
    vmovdqa(dst, src, rscratch);
  } else {
    movdqa(dst, src, rscratch);
  }
}
2124
// Width-agnostic opmask (k-register) moves: use the 64-bit kmovql when
// AVX-512BW is available (masks can cover 64 byte lanes), otherwise fall
// back to the 16-bit kmovwl, which base EVEX provides (see assert).
void MacroAssembler::kmov(KRegister dst, Address src) {
  if (VM_Version::supports_avx512bw()) {
    kmovql(dst, src);
  } else {
    assert(VM_Version::supports_evex(), "");
    kmovwl(dst, src);
  }
}

void MacroAssembler::kmov(Address dst, KRegister src) {
  if (VM_Version::supports_avx512bw()) {
    kmovql(dst, src);
  } else {
    assert(VM_Version::supports_evex(), "");
    kmovwl(dst, src);
  }
}

void MacroAssembler::kmov(KRegister dst, KRegister src) {
  if (VM_Version::supports_avx512bw()) {
    kmovql(dst, src);
  } else {
    assert(VM_Version::supports_evex(), "");
    kmovwl(dst, src);
  }
}

void MacroAssembler::kmov(Register dst, KRegister src) {
  if (VM_Version::supports_avx512bw()) {
    kmovql(dst, src);
  } else {
    assert(VM_Version::supports_evex(), "");
    kmovwl(dst, src);
  }
}

void MacroAssembler::kmov(KRegister dst, Register src) {
  if (VM_Version::supports_avx512bw()) {
    kmovql(dst, src);
  } else {
    assert(VM_Version::supports_evex(), "");
    kmovwl(dst, src);
  }
}
2169
// Opmask loads from an AddressLiteral: direct when RIP-reachable, else the
// address is materialized in rscratch first.
void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    kmovql(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    kmovql(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    kmovwl(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    kmovwl(dst, Address(rscratch, 0));
  }
}
2191
// Masked EVEX unaligned loads (byte/word/dword/qword element size) from an
// AddressLiteral. 'mask' selects lanes; 'merge' chooses merge- vs
// zero-masking. Direct when RIP-reachable, else via rscratch.
void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
                               int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
  } else {
    lea(rscratch, src);
    Assembler::evmovdqub(dst, mask, Address(rscratch, 0), merge, vector_len);
  }
}

void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
                               int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
  } else {
    lea(rscratch, src);
    Assembler::evmovdquw(dst, mask, Address(rscratch, 0), merge, vector_len);
  }
}

void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
  } else {
    lea(rscratch, src);
    Assembler::evmovdqul(dst, mask, Address(rscratch, 0), merge, vector_len);
  }
}

void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
  } else {
    lea(rscratch, src);
    Assembler::evmovdquq(dst, mask, Address(rscratch, 0), merge, vector_len);
  }
}

// Unmasked EVEX qword-element unaligned load from an AddressLiteral.
void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evmovdquq(dst, as_Address(src), vector_len);
  } else {
    lea(rscratch, src);
    Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
  }
}
2248
// Masked EVEX *aligned* qword-element load from an AddressLiteral; same
// reachable/rscratch pattern as the unaligned forms above.
void MacroAssembler::evmovdqaq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evmovdqaq(dst, mask, as_Address(src), merge, vector_len);
  } else {
    lea(rscratch, src);
    Assembler::evmovdqaq(dst, mask, Address(rscratch, 0), merge, vector_len);
  }
}

// Unmasked aligned variant.
void MacroAssembler::evmovdqaq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evmovdqaq(dst, as_Address(src), vector_len);
  } else {
    lea(rscratch, src);
    Assembler::evmovdqaq(dst, Address(rscratch, 0), vector_len);
  }
}
2270
// Aligned 128-bit loads from an AddressLiteral: direct when RIP-reachable,
// else via rscratch. Targets must satisfy the instructions' 16-byte
// alignment requirement.
void MacroAssembler::movapd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::movapd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::movapd(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::movdqa(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::movdqa(dst, Address(rscratch, 0));
  }
}
2292
// Scalar double / float loads from an AddressLiteral: direct when
// RIP-reachable, else via rscratch.
void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::movsd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::movsd(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::movss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::movss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::movss(dst, Address(rscratch, 0));
  }
}
2314
// Load-and-duplicate a 64-bit element from an AddressLiteral (scalar and
// vector forms): direct when RIP-reachable, else via rscratch.
void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::movddup(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::movddup(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::vmovddup(dst, as_Address(src), vector_len);
  } else {
    lea(rscratch, src);
    Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
  }
}
2336
// Scalar double / float multiply with a memory operand named by an
// AddressLiteral: direct when RIP-reachable, else via rscratch.
void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::mulsd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::mulsd(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::mulss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::mulss(dst, Address(rscratch, 0));
  }
}
2358
// Null-check 'reg'. When 'offset' is small enough that a later access to
// M[reg + offset] would itself fault inside the guard page, nothing is
// emitted; otherwise an explicit dummy access to M[reg] is emitted now so
// a null 'reg' faults here with a well-defined pc.
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS null exception if reg is null by
    // accessing M[reg] w/o changing any (non-CC) registers
    // NOTE: cmpl is plenty here to provoke a segv
    cmpptr(rax, Address(reg, 0));
    // Note: should probably use testl(rax, Address(reg, 0));
    //       may be shorter code (however, this version of
    //       testl needs to be implemented first)
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS null exception if reg is null
  }
}
2373
// Emit a call into the VM's os::breakpoint() rather than an int3
// instruction, so debuggers can set/handle it uniformly.
void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
  // (e.g., MSVC can't call ps() otherwise)
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}
2379
// Emit code that stops the VM with an "unimplemented: <what>" message.
// The message is formatted under a ResourceMark and then copied via
// code_string() so the pointer handed to stop() outlives the mark's scope.
void MacroAssembler::unimplemented(const char* what) {
  const char* buf = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("unimplemented: %s", what);
    buf = code_string(ss.as_string());
  }
  stop(buf);
}
2390
2391 #define XSTATE_BV 0x200
2392
// Restore full CPU state pushed by push_CPU_state() (reverse order:
// FPU state was pushed last, so it is popped first).
void MacroAssembler::pop_CPU_state() {
  pop_FPU_state();
  pop_IU_state();
}

// Restore FPU/SSE state saved with fxsave and release its stack area.
void MacroAssembler::pop_FPU_state() {
  fxrstor(Address(rsp, 0));
  addptr(rsp, FPUStateSizeInWords * wordSize);
}

// Restore integer state: undo pusha, drop the 8-byte alignment pad added
// by push_IU_state, then restore flags (pushed first, popped last).
void MacroAssembler::pop_IU_state() {
  popa();
  addq(rsp, 8);
  popf();
}
2408
2409 // Save Integer and Float state
2410 // Warning: Stack must be 16 byte aligned (64bit)
// Save Integer and Float state
// Warning: Stack must be 16 byte aligned (64bit)
void MacroAssembler::push_CPU_state() {
  push_IU_state();
  push_FPU_state();
}

// Reserve the fxsave area on the stack and save FPU/SSE state into it.
void MacroAssembler::push_FPU_state() {
  subptr(rsp, FPUStateSizeInWords * wordSize);
  fxsave(Address(rsp, 0));
}

void MacroAssembler::push_IU_state() {
  // Push flags first because pusha kills them
  pushf();
  // Make sure rsp stays 16-byte aligned
  subq(rsp, 8);
  pusha();
}
2428
// Record the current rsp in the thread's cont_fastpath watermark, but only
// if rsp is above (younger than) the currently recorded value - the
// belowEqual branch skips the store otherwise. No-op when Loom
// continuations are disabled.
void MacroAssembler::push_cont_fastpath() {
  if (!Continuations::enabled()) return;

  Label L_done;
  cmpptr(rsp, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  jccb(Assembler::belowEqual, L_done);
  movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rsp);
  bind(L_done);
}

// Clear the cont_fastpath watermark once the stack has popped past it
// (only when rsp is at or above the recorded value; the below branch
// skips the clear otherwise). No-op when continuations are disabled.
void MacroAssembler::pop_cont_fastpath() {
  if (!Continuations::enabled()) return;

  Label L_done;
  cmpptr(rsp, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  jccb(Assembler::below, L_done);
  movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
  bind(L_done);
}
2448
2449 #ifdef ASSERT
// Debug-only guard: stop the VM with 'name' if the thread currently has a
// continuation entry. Clobbers 'cont'.
// NOTE(review): testl tests only the low 32 bits of the loaded pointer;
// adequate here only if a non-null cont_entry never has all-zero low bits -
// presumably true for a stack address, but confirm.
void MacroAssembler::stop_if_in_cont(Register cont, const char* name) {
  Label no_cont;
  movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset()));
  testl(cont, cont);
  jcc(Assembler::zero, no_cont);
  stop(name);
  bind(no_cont);
}
2458 #endif
2459
// Clear the thread's last-Java-frame anchor (sp, optionally fp, and pc).
void MacroAssembler::reset_last_Java_frame(bool clear_fp) { // determine java_thread register
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }
  // Always clear the pc because it could have been set by make_walkable()
  movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  // Zero upper AVX state here - presumably to avoid AVX/SSE transition
  // penalties in the native code we are returning through; confirm intent.
  vzeroupper();
}
2472
// Round 'reg' up to the next multiple of 'modulus'. The add/and trick is
// only correct when 'modulus' is a power of two.
void MacroAssembler::round_to(Register reg, int modulus) {
  addptr(reg, modulus - 1);
  andptr(reg, -modulus);
}
2477
// Emit a safepoint/handshake poll; branch to 'slow_path' when a stop is
// requested. At returns the stack-watermark comparison is used; elsewhere
// only the poll bit in the polling word is tested.
void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod) {
  if (at_return) {
    // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
    // we may safely use rsp instead to perform the stack watermark check.
    cmpptr(in_nmethod ? rsp : rbp, Address(r15_thread, JavaThread::polling_word_offset()));
    jcc(Assembler::above, slow_path);
    return;
  }
  testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
  jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
}
2489
2490 // Calls to C land
2491 //
2492 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded
2493 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
2494 // has to be reset to 0. This is required to allow proper stack traversal.
// Record the last Java frame (sp, optional fp, optional pc) in the
// thread-local frame anchor before calling into C land. sp defaults to
// rsp when an invalid register is passed; sp is stored LAST so the anchor
// only becomes "set" once fp/pc are already in place.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register rscratch) {
  vzeroupper();
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
  }
  // last_java_pc is optional
  if (last_java_pc != nullptr) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(java_pc, InternalAddress(last_java_pc), rscratch);
  }
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}

// Label variant: resolve the pc from a (possibly not-yet-bound) Label via
// lea, store it, then delegate with a null pc so it is not stored twice.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  lea(scratch, L);
  movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), scratch);
  set_last_Java_frame(last_java_sp, last_java_fp, nullptr, scratch);
}
2525
// Pointer-width shifts: 64-bit on x86-64.
void MacroAssembler::shlptr(Register dst, int imm8) {
  shlq(dst, imm8);
}

void MacroAssembler::shrptr(Register dst, int imm8) {
  shrq(dst, imm8);
}

// Sign-extend the low byte / word of 'reg' into its 32-bit form in place.
void MacroAssembler::sign_extend_byte(Register reg) {
  movsbl(reg, reg); // movsxb
}

void MacroAssembler::sign_extend_short(Register reg) {
  movswl(reg, reg); // movsxw
}
2541
// test with immediate, choosing the shortest correct encoding. A
// non-negative immediate that fits in 8 bits can use testb: only the low
// byte is tested, and since the other immediate bits are zero the flag
// results match the 32-bit test.
void MacroAssembler::testl(Address dst, int32_t imm32) {
  if (imm32 >= 0 && is8bit(imm32)) {
    testb(dst, imm32);
  } else {
    Assembler::testl(dst, imm32);
  }
}

// Register form additionally requires that the register's low byte is
// encodable (has_byte_register).
void MacroAssembler::testl(Register dst, int32_t imm32) {
  if (imm32 >= 0 && is8bit(imm32) && dst->has_byte_register()) {
    testb(dst, imm32);
  } else {
    Assembler::testl(dst, imm32);
  }
}

void MacroAssembler::testl(Register dst, AddressLiteral src) {
  assert(always_reachable(src), "Address should be reachable");
  testl(dst, as_Address(src));
}

// 64-bit test: a non-negative imm32 sign-extends to a mask with zero high
// bits, so the 32-bit test produces identical flags and is shorter.
void MacroAssembler::testq(Address dst, int32_t imm32) {
  if (imm32 >= 0) {
    testl(dst, imm32);
  } else {
    Assembler::testq(dst, imm32);
  }
}

void MacroAssembler::testq(Register dst, int32_t imm32) {
  if (imm32 >= 0) {
    testl(dst, imm32);
  } else {
    Assembler::testq(dst, imm32);
  }
}
2578
// Legacy-SSE wrappers that assert register-encodability: these encodings
// reach only XMM0-15 unless the noted AVX-512 subset is present (and for
// pcmpestri/pmovmskb/ptest there is no EVEX form at all, so XMM16+ is
// never allowed).
void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pcmpeqb(dst, src);
}

void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pcmpeqw(dst, src);
}

void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
  assert((dst->encoding() < 16),"XMM register should be 0-15");
  Assembler::pcmpestri(dst, src, imm8);
}

void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
  Assembler::pcmpestri(dst, src, imm8);
}

void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pmovzxbw(dst, src);
}

void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pmovzxbw(dst, src);
}

void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
  assert((src->encoding() < 16),"XMM register should be 0-15");
  Assembler::pmovmskb(dst, src);
}

void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
  Assembler::ptest(dst, src);
}
2618
// Scalar SSE arithmetic with an AddressLiteral memory operand: direct when
// RIP-reachable, otherwise via rscratch.
void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::sqrtss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::sqrtss(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::subsd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::subsd(dst, Address(rscratch, 0));
  }
}

// 'rmode' selects the rounding mode for roundsd.
void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::roundsd(dst, as_Address(src), rmode);
  } else {
    lea(rscratch, src);
    Assembler::roundsd(dst, Address(rscratch, 0), rmode);
  }
}

void MacroAssembler::subss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::subss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::subss(dst, Address(rscratch, 0));
  }
}
2662
// Scalar floating-point compares against an AddressLiteral operand
// (unordered compare setting EFLAGS). All follow the same pattern: direct
// when RIP-reachable, else the address goes through rscratch. The evucom*
// variants are the EVEX-encoded forms; evucomish/evucomxsh operate on
// half-precision scalars.
void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::ucomisd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::ucomisd(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::evucomxsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evucomxsd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::evucomxsd(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::ucomiss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::ucomiss(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::evucomxss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evucomxss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::evucomxss(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::evucomish(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evucomish(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::evucomish(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::evucomxsh(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evucomxsh(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::evucomxsh(dst, Address(rscratch, 0));
  }
}
2728
// xorpd/xorps wrappers. When the destination is XMM16+ but the CPU lacks
// AVX-512DQ+VL, the legacy xorpd/xorps encodings cannot name the register,
// so the operation is emitted as a full 512-bit vpxor instead. Otherwise
// the usual reachable/rscratch AddressLiteral pattern applies.
void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  // Used in sign-bit flipping with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");

  if (UseAVX > 2 &&
      (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
      (dst->encoding() >= 16)) {
    vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch);
  } else if (reachable(src)) {
    Assembler::xorpd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::xorpd(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
  if (UseAVX > 2 &&
      (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
      ((dst->encoding() >= 16) || (src->encoding() >= 16))) {
    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
  } else {
    Assembler::xorpd(dst, src);
  }
}

void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
  if (UseAVX > 2 &&
      (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
      ((dst->encoding() >= 16) || (src->encoding() >= 16))) {
    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
  } else {
    Assembler::xorps(dst, src);
  }
}

void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  // Used in sign-bit flipping with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");

  if (UseAVX > 2 &&
      (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
      (dst->encoding() >= 16)) {
    vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch);
  } else if (reachable(src)) {
    Assembler::xorps(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::xorps(dst, Address(rscratch, 0));
  }
}
2784
// Byte shuffle with an AddressLiteral mask. The legacy SSE memory form
// requires 16-byte alignment of the operand (checked below); direct when
// RIP-reachable, else via rscratch.
void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  // Used in sign-bit flipping with aligned address.
  bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
  assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::pshufb(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::pshufb(dst, Address(rscratch, 0));
  }
}
2798
2799 // AVX 3-operands instructions
2800
// AVX 3-operand adds with an AddressLiteral memory operand: dst = nds + [src].
// Direct when RIP-reachable, else via rscratch.
void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    vaddsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch, src);
    vaddsd(dst, nds, Address(rscratch, 0));
  }
}

void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    vaddss(dst, nds, as_Address(src));
  } else {
    lea(rscratch, src);
    vaddss(dst, nds, Address(rscratch, 0));
  }
}

// Packed byte / dword adds; require some AVX level since these are the
// VEX/EVEX 3-operand forms.
void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
  assert(UseAVX > 0, "requires some form of AVX");
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch, src);
    Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
  }
}

void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
  assert(UseAVX > 0, "requires some form of AVX");
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch, src);
    Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
  }
}
2846
// Absolute value of a float / double via AND with a sign-bit-clearing
// constant ('negate_field'). 'src' is not referenced in the body - only
// 'nds' feeds the AND; presumably callers pass the same register, confirm.
void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
  assert(rscratch != noreg || always_reachable(negate_field), "missing");

  vandps(dst, nds, negate_field, vector_len, rscratch);
}

void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
  assert(rscratch != noreg || always_reachable(negate_field), "missing");

  vandpd(dst, nds, negate_field, vector_len, rscratch);
}
2860
2861 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
2862 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2863 Assembler::vpaddb(dst, nds, src, vector_len);
2864 }
2865
2866 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
2867 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2868 Assembler::vpaddb(dst, nds, src, vector_len);
2869 }
2870
2871 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
2872 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2873 Assembler::vpaddw(dst, nds, src, vector_len);
2874 }
2875
2876 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
2877 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2878 Assembler::vpaddw(dst, nds, src, vector_len);
2879 }
2880
2881 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2882 assert(rscratch != noreg || always_reachable(src), "missing");
2883
2884 if (reachable(src)) {
2885 Assembler::vpand(dst, nds, as_Address(src), vector_len);
2886 } else {
2887 lea(rscratch, src);
2888 Assembler::vpand(dst, nds, Address(rscratch, 0), vector_len);
2889 }
2890 }
2891
2892 void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2893 assert(rscratch != noreg || always_reachable(src), "missing");
2894
2895 if (reachable(src)) {
2896 Assembler::vpbroadcastd(dst, as_Address(src), vector_len);
2897 } else {
2898 lea(rscratch, src);
2899 Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len);
2900 }
2901 }
2902
2903 void MacroAssembler::vbroadcasti128(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2904 assert(rscratch != noreg || always_reachable(src), "missing");
2905
2906 if (reachable(src)) {
2907 Assembler::vbroadcasti128(dst, as_Address(src), vector_len);
2908 } else {
2909 lea(rscratch, src);
2910 Assembler::vbroadcasti128(dst, Address(rscratch, 0), vector_len);
2911 }
2912 }
2913
2914 void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2915 assert(rscratch != noreg || always_reachable(src), "missing");
2916
2917 if (reachable(src)) {
2918 Assembler::vpbroadcastq(dst, as_Address(src), vector_len);
2919 } else {
2920 lea(rscratch, src);
2921 Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len);
2922 }
2923 }
2924
2925 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2926 assert(rscratch != noreg || always_reachable(src), "missing");
2927
2928 if (reachable(src)) {
2929 Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
2930 } else {
2931 lea(rscratch, src);
2932 Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
2933 }
2934 }
2935
2936 void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2937 assert(rscratch != noreg || always_reachable(src), "missing");
2938
2939 if (reachable(src)) {
2940 Assembler::vbroadcastss(dst, as_Address(src), vector_len);
2941 } else {
2942 lea(rscratch, src);
2943 Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len);
2944 }
2945 }
2946
// Vector float blend
// vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
// Per 32-bit lane: dst = (mask lane sign bit set) ? src2 : src1.
// When E-core optimizations are enabled and a usable scratch register was
// supplied, the blend is emulated with AND/ANDN/OR; otherwise the plain
// VBLENDVPS instruction is emitted.
void MacroAssembler::vblendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
  // WARN: Allow dst == (src1|src2), mask == scratch
  bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1 &&
                         !(VM_Version::is_intel_darkmont() && (dst == src1)); // partially fixed on Darkmont
  bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst;
  // dst may alias one source, but not both, and never the mask.
  bool dst_available = dst != mask && (dst != src1 || dst != src2);
  if (blend_emulation && scratch_available && dst_available) {
    if (compute_mask) {
      // Arithmetic right shift by 32 replicates each lane's sign bit,
      // producing an all-ones/all-zeros lane mask in scratch.
      vpsrad(scratch, mask, 32, vector_len);
      mask = scratch;
    }
    // Order the AND/ANDN pair so the aliased register is overwritten last.
    if (dst == src1) {
      vpandn(dst, mask, src1, vector_len); // if mask == 0, src1
      vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
    } else {
      vpand (dst, mask, src2, vector_len); // if mask == 1, src2
      vpandn(scratch, mask, src1, vector_len); // if mask == 0, src1
    }
    // Combine the two disjoint selections.
    vpor(dst, dst, scratch, vector_len);
  } else {
    Assembler::vblendvps(dst, src1, src2, mask, vector_len);
  }
}
2972
// vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
// Per 64-bit lane: dst = (mask lane sign bit set) ? src2 : src1.
// Same emulation strategy as vblendvps above, but the lane mask is built
// with a signed compare against zero (0 > mask) because there is no AVX2
// 64-bit arithmetic shift.
void MacroAssembler::vblendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
  // WARN: Allow dst == (src1|src2), mask == scratch
  bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1 &&
                         !(VM_Version::is_intel_darkmont() && (dst == src1)); // partially fixed on Darkmont
  // Unlike the ps flavor, mask computation here zeroes scratch before
  // reading mask, so scratch must not alias mask when compute_mask is set.
  bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst && (!compute_mask || scratch != mask);
  bool dst_available = dst != mask && (dst != src1 || dst != src2);
  if (blend_emulation && scratch_available && dst_available) {
    if (compute_mask) {
      // scratch = (0 > mask) per lane: all-ones where the lane is negative.
      vpxor(scratch, scratch, scratch, vector_len);
      vpcmpgtq(scratch, scratch, mask, vector_len);
      mask = scratch;
    }
    // Order the AND/ANDN pair so the aliased register is overwritten last.
    if (dst == src1) {
      vpandn(dst, mask, src1, vector_len); // if mask == 0, src
      vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
    } else {
      vpand (dst, mask, src2, vector_len); // if mask == 1, src2
      vpandn(scratch, mask, src1, vector_len); // if mask == 0, src
    }
    // Combine the two disjoint selections.
    vpor(dst, dst, scratch, vector_len);
  } else {
    Assembler::vblendvpd(dst, src1, src2, mask, vector_len);
  }
}
2998
2999 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3000 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3001 Assembler::vpcmpeqb(dst, nds, src, vector_len);
3002 }
3003
3004 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) {
3005 assert(((dst->encoding() < 16 && src1->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3006 Assembler::vpcmpeqb(dst, src1, src2, vector_len);
3007 }
3008
3009 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3010 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3011 Assembler::vpcmpeqw(dst, nds, src, vector_len);
3012 }
3013
3014 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3015 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3016 Assembler::vpcmpeqw(dst, nds, src, vector_len);
3017 }
3018
3019 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3020 assert(rscratch != noreg || always_reachable(src), "missing");
3021
3022 if (reachable(src)) {
3023 Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3024 } else {
3025 lea(rscratch, src);
3026 Assembler::evpcmpeqd(kdst, mask, nds, Address(rscratch, 0), vector_len);
3027 }
3028 }
3029
3030 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3031 int comparison, bool is_signed, int vector_len, Register rscratch) {
3032 assert(rscratch != noreg || always_reachable(src), "missing");
3033
3034 if (reachable(src)) {
3035 Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3036 } else {
3037 lea(rscratch, src);
3038 Assembler::evpcmpd(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3039 }
3040 }
3041
3042 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3043 int comparison, bool is_signed, int vector_len, Register rscratch) {
3044 assert(rscratch != noreg || always_reachable(src), "missing");
3045
3046 if (reachable(src)) {
3047 Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3048 } else {
3049 lea(rscratch, src);
3050 Assembler::evpcmpq(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3051 }
3052 }
3053
3054 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3055 int comparison, bool is_signed, int vector_len, Register rscratch) {
3056 assert(rscratch != noreg || always_reachable(src), "missing");
3057
3058 if (reachable(src)) {
3059 Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3060 } else {
3061 lea(rscratch, src);
3062 Assembler::evpcmpb(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3063 }
3064 }
3065
3066 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3067 int comparison, bool is_signed, int vector_len, Register rscratch) {
3068 assert(rscratch != noreg || always_reachable(src), "missing");
3069
3070 if (reachable(src)) {
3071 Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3072 } else {
3073 lea(rscratch, src);
3074 Assembler::evpcmpw(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3075 }
3076 }
3077
3078 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3079 if (width == Assembler::Q) {
3080 Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3081 } else {
3082 Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3083 }
3084 }
3085
// Emit a packed integer compare for predicate 'cond' over lanes of the
// given width. Predicates without a direct instruction are synthesized:
// neq/le/nlt are eq/gt followed by a bitwise NOT (XOR with all-ones via
// xtmp); lt/nle reuse gt with the operands swapped.
void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
  int eq_cond_enc = 0x29; // pcmpeqq opcode
  int gt_cond_enc = 0x37; // pcmpgtq opcode
  if (width != Assembler::Q) {
    // pcmpeqb/w/d and pcmpgtb/w/d opcodes are consecutive by width.
    eq_cond_enc = 0x74 + width;
    gt_cond_enc = 0x64 + width;
  }
  switch (cond) {
  case eq:
    vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
    break;
  case neq:
    // neq = NOT(eq)
    vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
    vallones(xtmp, vector_len);
    vpxor(dst, xtmp, dst, vector_len);
    break;
  case le:
    // le = NOT(gt)
    vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
    vallones(xtmp, vector_len);
    vpxor(dst, xtmp, dst, vector_len);
    break;
  case nlt:
    // nlt = NOT(lt) = NOT(gt with swapped operands)
    vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
    vallones(xtmp, vector_len);
    vpxor(dst, xtmp, dst, vector_len);
    break;
  case lt:
    // lt = gt with swapped operands
    vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
    break;
  case nle:
    // nle = gt
    vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
    break;
  default:
    assert(false, "Should not reach here");
  }
}
3122
3123 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3124 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3125 Assembler::vpmovzxbw(dst, src, vector_len);
3126 }
3127
3128 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3129 assert((src->encoding() < 16),"XMM register should be 0-15");
3130 Assembler::vpmovmskb(dst, src, vector_len);
3131 }
3132
3133 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3134 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3135 Assembler::vpmullw(dst, nds, src, vector_len);
3136 }
3137
3138 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3139 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3140 Assembler::vpmullw(dst, nds, src, vector_len);
3141 }
3142
3143 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3144 assert((UseAVX > 0), "AVX support is needed");
3145 assert(rscratch != noreg || always_reachable(src), "missing");
3146
3147 if (reachable(src)) {
3148 Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3149 } else {
3150 lea(rscratch, src);
3151 Assembler::vpmulld(dst, nds, Address(rscratch, 0), vector_len);
3152 }
3153 }
3154
3155 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3156 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3157 Assembler::vpsubb(dst, nds, src, vector_len);
3158 }
3159
3160 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3161 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3162 Assembler::vpsubb(dst, nds, src, vector_len);
3163 }
3164
3165 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3166 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3167 Assembler::vpsubw(dst, nds, src, vector_len);
3168 }
3169
3170 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3171 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3172 Assembler::vpsubw(dst, nds, src, vector_len);
3173 }
3174
3175 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3176 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3177 Assembler::vpsraw(dst, nds, shift, vector_len);
3178 }
3179
3180 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3181 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3182 Assembler::vpsraw(dst, nds, shift, vector_len);
3183 }
3184
3185 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3186 assert(UseAVX > 2,"");
3187 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3188 vector_len = 2;
3189 }
3190 Assembler::evpsraq(dst, nds, shift, vector_len);
3191 }
3192
3193 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3194 assert(UseAVX > 2,"");
3195 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3196 vector_len = 2;
3197 }
3198 Assembler::evpsraq(dst, nds, shift, vector_len);
3199 }
3200
3201 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3202 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3203 Assembler::vpsrlw(dst, nds, shift, vector_len);
3204 }
3205
3206 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3207 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3208 Assembler::vpsrlw(dst, nds, shift, vector_len);
3209 }
3210
3211 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3212 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3213 Assembler::vpsllw(dst, nds, shift, vector_len);
3214 }
3215
3216 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3217 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3218 Assembler::vpsllw(dst, nds, shift, vector_len);
3219 }
3220
3221 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3222 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3223 Assembler::vptest(dst, src);
3224 }
3225
3226 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3227 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3228 Assembler::punpcklbw(dst, src);
3229 }
3230
3231 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3232 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3233 Assembler::pshufd(dst, src, mode);
3234 }
3235
3236 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3237 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3238 Assembler::pshuflw(dst, src, mode);
3239 }
3240
3241 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3242 assert(rscratch != noreg || always_reachable(src), "missing");
3243
3244 if (reachable(src)) {
3245 vandpd(dst, nds, as_Address(src), vector_len);
3246 } else {
3247 lea(rscratch, src);
3248 vandpd(dst, nds, Address(rscratch, 0), vector_len);
3249 }
3250 }
3251
3252 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3253 assert(rscratch != noreg || always_reachable(src), "missing");
3254
3255 if (reachable(src)) {
3256 vandps(dst, nds, as_Address(src), vector_len);
3257 } else {
3258 lea(rscratch, src);
3259 vandps(dst, nds, Address(rscratch, 0), vector_len);
3260 }
3261 }
3262
3263 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3264 bool merge, int vector_len, Register rscratch) {
3265 assert(rscratch != noreg || always_reachable(src), "missing");
3266
3267 if (reachable(src)) {
3268 Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3269 } else {
3270 lea(rscratch, src);
3271 Assembler::evpord(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
3272 }
3273 }
3274
3275 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3276 assert(rscratch != noreg || always_reachable(src), "missing");
3277
3278 if (reachable(src)) {
3279 vdivsd(dst, nds, as_Address(src));
3280 } else {
3281 lea(rscratch, src);
3282 vdivsd(dst, nds, Address(rscratch, 0));
3283 }
3284 }
3285
3286 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3287 assert(rscratch != noreg || always_reachable(src), "missing");
3288
3289 if (reachable(src)) {
3290 vdivss(dst, nds, as_Address(src));
3291 } else {
3292 lea(rscratch, src);
3293 vdivss(dst, nds, Address(rscratch, 0));
3294 }
3295 }
3296
3297 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3298 assert(rscratch != noreg || always_reachable(src), "missing");
3299
3300 if (reachable(src)) {
3301 vmulsd(dst, nds, as_Address(src));
3302 } else {
3303 lea(rscratch, src);
3304 vmulsd(dst, nds, Address(rscratch, 0));
3305 }
3306 }
3307
3308 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3309 assert(rscratch != noreg || always_reachable(src), "missing");
3310
3311 if (reachable(src)) {
3312 vmulss(dst, nds, as_Address(src));
3313 } else {
3314 lea(rscratch, src);
3315 vmulss(dst, nds, Address(rscratch, 0));
3316 }
3317 }
3318
3319 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3320 assert(rscratch != noreg || always_reachable(src), "missing");
3321
3322 if (reachable(src)) {
3323 vsubsd(dst, nds, as_Address(src));
3324 } else {
3325 lea(rscratch, src);
3326 vsubsd(dst, nds, Address(rscratch, 0));
3327 }
3328 }
3329
3330 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3331 assert(rscratch != noreg || always_reachable(src), "missing");
3332
3333 if (reachable(src)) {
3334 vsubss(dst, nds, as_Address(src));
3335 } else {
3336 lea(rscratch, src);
3337 vsubss(dst, nds, Address(rscratch, 0));
3338 }
3339 }
3340
3341 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3342 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3343 assert(rscratch != noreg || always_reachable(src), "missing");
3344
3345 vxorps(dst, nds, src, Assembler::AVX_128bit, rscratch);
3346 }
3347
3348 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3349 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3350 assert(rscratch != noreg || always_reachable(src), "missing");
3351
3352 vxorpd(dst, nds, src, Assembler::AVX_128bit, rscratch);
3353 }
3354
3355 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3356 assert(rscratch != noreg || always_reachable(src), "missing");
3357
3358 if (reachable(src)) {
3359 vxorpd(dst, nds, as_Address(src), vector_len);
3360 } else {
3361 lea(rscratch, src);
3362 vxorpd(dst, nds, Address(rscratch, 0), vector_len);
3363 }
3364 }
3365
3366 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3367 assert(rscratch != noreg || always_reachable(src), "missing");
3368
3369 if (reachable(src)) {
3370 vxorps(dst, nds, as_Address(src), vector_len);
3371 } else {
3372 lea(rscratch, src);
3373 vxorps(dst, nds, Address(rscratch, 0), vector_len);
3374 }
3375 }
3376
3377 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3378 assert(rscratch != noreg || always_reachable(src), "missing");
3379
3380 if (UseAVX > 1 || (vector_len < 1)) {
3381 if (reachable(src)) {
3382 Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3383 } else {
3384 lea(rscratch, src);
3385 Assembler::vpxor(dst, nds, Address(rscratch, 0), vector_len);
3386 }
3387 } else {
3388 MacroAssembler::vxorpd(dst, nds, src, vector_len, rscratch);
3389 }
3390 }
3391
3392 void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3393 assert(rscratch != noreg || always_reachable(src), "missing");
3394
3395 if (reachable(src)) {
3396 Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3397 } else {
3398 lea(rscratch, src);
3399 Assembler::vpermd(dst, nds, Address(rscratch, 0), vector_len);
3400 }
3401 }
3402
// Strip the JNI handle tag bits from a jobject, leaving the raw handle
// address in possibly_non_local.
void MacroAssembler::clear_jobject_tag(Register possibly_non_local) {
  const int32_t inverted_mask = ~static_cast<int32_t>(JNIHandles::tag_mask);
  STATIC_ASSERT(inverted_mask == -4); // otherwise check this code
  // The inverted mask (-4) is sign-extended to 64 bits, so a 32-bit
  // immediate AND clears only the low tag bits.
  andptr(possibly_non_local, inverted_mask);
}
3409
// Resolve a jobject (tagged JNI handle) in 'value' to the oop it refers to.
// Handles null, local, global, and weak-global handles; the low tag bits of
// the handle select the path and the matching GC load barrier. Clobbers tmp
// and leaves the resolved oop in value.
void MacroAssembler::resolve_jobject(Register value,
                                     Register tmp) {
  Register thread = r15_thread;
  assert_different_registers(value, thread, tmp);
  Label done, tagged, weak_tagged;
  testptr(value, value);
  jcc(Assembler::zero, done); // Use null as-is.
  testptr(value, JNIHandles::tag_mask); // Test for tag.
  jcc(Assembler::notZero, tagged);

  // Resolve local handle (untagged): plain load, no GC barrier needed.
  access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp);
  verify_oop(value);
  jmp(done);

  bind(tagged);
  testptr(value, JNIHandles::TypeTag::weak_global); // Test for weak tag.
  jcc(Assembler::notZero, weak_tagged);

  // Resolve global handle; the tag is subtracted out via the address offset.
  access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp);
  verify_oop(value);
  jmp(done);

  bind(weak_tagged);
  // Resolve jweak: needs the phantom-reference load barrier.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                 value, Address(value, -JNIHandles::TypeTag::weak_global), tmp);
  verify_oop(value);

  bind(done);
}
3442
// Resolve a jobject that is known to be a global handle (or null). Debug
// builds stop if the handle does not carry the global tag. Clobbers tmp and
// leaves the resolved oop in value.
void MacroAssembler::resolve_global_jobject(Register value,
                                            Register tmp) {
  Register thread = r15_thread;
  assert_different_registers(value, thread, tmp);
  Label done;

  testptr(value, value);
  jcc(Assembler::zero, done); // Use null as-is.

#ifdef ASSERT
  {
    // Verify the global tag bit is set before resolving.
    Label valid_global_tag;
    testptr(value, JNIHandles::TypeTag::global); // Test for global tag.
    jcc(Assembler::notZero, valid_global_tag);
    stop("non global jobject using resolve_global_jobject");
    bind(valid_global_tag);
  }
#endif

  // Resolve global handle; the tag is subtracted out via the address offset.
  access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp);
  verify_oop(value);

  bind(done);
}
3468
// Pointer-sized subtract: dst -= imm32 (sign-extended to 64 bits).
void MacroAssembler::subptr(Register dst, int32_t imm32) {
  subq(dst, imm32);
}
3472
// Force generation of a 4 byte immediate value even if it fits into 8bit
// (useful when the instruction will be patched later).
void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
  subq_imm32(dst, imm32);
}
3477
// Pointer-sized subtract: dst -= src.
void MacroAssembler::subptr(Register dst, Register src) {
  subq(dst, src);
}
3481
3482 // C++ bool manipulation
3483 void MacroAssembler::testbool(Register dst) {
3484 if(sizeof(bool) == 1)
3485 testb(dst, 0xff);
3486 else if(sizeof(bool) == 2) {
3487 // testw implementation needed for two byte bools
3488 ShouldNotReachHere();
3489 } else if(sizeof(bool) == 4)
3490 testl(dst, dst);
3491 else
3492 // unsupported
3493 ShouldNotReachHere();
3494 }
3495
// Pointer-sized test: sets flags from dst AND src.
void MacroAssembler::testptr(Register dst, Register src) {
  testq(dst, src);
}
3499
3500 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3501 void MacroAssembler::tlab_allocate(Register obj,
3502 Register var_size_in_bytes,
3503 int con_size_in_bytes,
3504 Register t1,
3505 Register t2,
3506 Label& slow_case) {
3507 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3508 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3509 }
3510
3511 RegSet MacroAssembler::call_clobbered_gp_registers() {
3512 RegSet regs;
3513 regs += RegSet::of(rax, rcx, rdx);
3514 #ifndef _WINDOWS
3515 regs += RegSet::of(rsi, rdi);
3516 #endif
3517 regs += RegSet::range(r8, r11);
3518 if (UseAPX) {
3519 regs += RegSet::range(r16, as_Register(Register::number_of_registers - 1));
3520 }
3521 return regs;
3522 }
3523
3524 XMMRegSet MacroAssembler::call_clobbered_xmm_registers() {
3525 int num_xmm_registers = XMMRegister::available_xmm_registers();
3526 #if defined(_WINDOWS)
3527 XMMRegSet result = XMMRegSet::range(xmm0, xmm5);
3528 if (num_xmm_registers > 16) {
3529 result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1));
3530 }
3531 return result;
3532 #else
3533 return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1));
3534 #endif
3535 }
3536
// C1 only ever uses the first double/float of the XMM register.
// Size in bytes of one XMM save slot in the spill area.
static int xmm_save_size() { return sizeof(double); }
3539
// Spill the low 64 bits of reg to [rsp + offset].
static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
  masm->movdbl(Address(rsp, offset), reg);
}
3543
// Reload the low 64 bits of reg from [rsp + offset].
static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
  masm->movdbl(reg, Address(rsp, offset));
}
3547
3548 static int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers,
3549 bool save_fpu, int& gp_area_size, int& xmm_area_size) {
3550
3551 gp_area_size = align_up(gp_registers.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size,
3552 StackAlignmentInBytes);
3553 xmm_area_size = save_fpu ? xmm_registers.size() * xmm_save_size() : 0;
3554
3555 return gp_area_size + xmm_area_size;
3556 }
3557
3558 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) {
3559 block_comment("push_call_clobbered_registers start");
3560 // Regular registers
3561 RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude;
3562
3563 int gp_area_size;
3564 int xmm_area_size;
3565 int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu,
3566 gp_area_size, xmm_area_size);
3567 subptr(rsp, total_save_size);
3568
3569 push_set(gp_registers_to_push, 0);
3570
3571 if (save_fpu) {
3572 push_set(call_clobbered_xmm_registers(), gp_area_size);
3573 }
3574
3575 block_comment("push_call_clobbered_registers end");
3576 }
3577
3578 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) {
3579 block_comment("pop_call_clobbered_registers start");
3580
3581 RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude;
3582
3583 int gp_area_size;
3584 int xmm_area_size;
3585 int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu,
3586 gp_area_size, xmm_area_size);
3587
3588 if (restore_fpu) {
3589 pop_set(call_clobbered_xmm_registers(), gp_area_size);
3590 }
3591
3592 pop_set(gp_registers_to_pop, 0);
3593
3594 addptr(rsp, total_save_size);
3595
3596 vzeroupper();
3597
3598 block_comment("pop_call_clobbered_registers end");
3599 }
3600
3601 void MacroAssembler::push_set(XMMRegSet set, int offset) {
3602 assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be");
3603 int spill_offset = offset;
3604
3605 for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) {
3606 save_xmm_register(this, spill_offset, *it);
3607 spill_offset += xmm_save_size();
3608 }
3609 }
3610
3611 void MacroAssembler::pop_set(XMMRegSet set, int offset) {
3612 int restore_size = set.size() * xmm_save_size();
3613 assert(is_aligned(restore_size, StackAlignmentInBytes), "must be");
3614
3615 int restore_offset = offset + restore_size - xmm_save_size();
3616
3617 for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) {
3618 restore_xmm_register(this, restore_offset, *it);
3619 restore_offset -= xmm_save_size();
3620 }
3621 }
3622
3623 void MacroAssembler::push_set(RegSet set, int offset) {
3624 int spill_offset;
3625 if (offset == -1) {
3626 int register_push_size = set.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3627 int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
3628 subptr(rsp, aligned_size);
3629 spill_offset = 0;
3630 } else {
3631 spill_offset = offset;
3632 }
3633
3634 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
3635 movptr(Address(rsp, spill_offset), *it);
3636 spill_offset += Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3637 }
3638 }
3639
3640 void MacroAssembler::pop_set(RegSet set, int offset) {
3641
3642 int gp_reg_size = Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3643 int restore_size = set.size() * gp_reg_size;
3644 int aligned_size = align_up(restore_size, StackAlignmentInBytes);
3645
3646 int restore_offset;
3647 if (offset == -1) {
3648 restore_offset = restore_size - gp_reg_size;
3649 } else {
3650 restore_offset = offset + restore_size - gp_reg_size;
3651 }
3652 for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) {
3653 movptr(*it, Address(rsp, restore_offset));
3654 restore_offset -= gp_reg_size;
3655 }
3656
3657 if (offset == -1) {
3658 addptr(rsp, aligned_size);
3659 }
3660 }
3661
// Preserves the contents of address, destroys the contents length_in_bytes and temp.
// Emits code that zeroes length_in_bytes bytes starting at
// address + offset_in_bytes. length_in_bytes must be a runtime multiple
// of BytesPerWord (checked under ASSERT); a zero length is a no-op.
void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
  assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
  assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
  Label done;

  // Nothing to do for a zero length.
  testptr(length_in_bytes, length_in_bytes);
  jcc(Assembler::zero, done);

  // initialize topmost word, divide index by 2, check if odd and test if zero
  // note: for the remaining code to work, index must be a multiple of BytesPerWord
#ifdef ASSERT
  {
    Label L;
    testptr(length_in_bytes, BytesPerWord - 1);
    jcc(Assembler::zero, L);
    stop("length must be a multiple of BytesPerWord");
    bind(L);
  }
#endif
  // From here on, length_in_bytes is reused as a word-count loop index.
  Register index = length_in_bytes;
  xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
  if (UseIncDec) {
    shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
  } else {
    shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
    shrptr(index, 1);
  }

  // initialize remaining object fields: index is a multiple of 2 now
  {
    Label loop;
    bind(loop);
    // Store the zero register at address + index*8 + offset - 8, i.e. the
    // words are cleared from the top of the range downwards.
    movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
    decrement(index);
    jcc(Assembler::notZero, loop);
  }

  bind(done);
}
3702
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");

  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = in_bytes(itableMethodEntry::method_offset());
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  // scan_temp = vtable length (in entries).
  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // Could store the aligned, prescaled offset in the klass.
  // scan_temp = address of the first itableOffsetEntry (just past the vtable).
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  }

  // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The loop body is emitted twice (peel == 1, then peel == 0): the peeled
  // first iteration takes a short forward branch on an immediate hit; the
  // second copy forms the actual scan loop (search .. jcc back via fallthrough).
  for (int peel = 1; peel >= 0; peel--) {
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset()));
    cmpptr(intf_klass, method_result);

    if (peel) {
      jccb(Assembler::equal, found_method);
    } else {
      jccb(Assembler::notEqual, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    testptr(method_result, method_result);
    jcc(Assembler::zero, L_no_such_interface);
    addptr(scan_temp, scan_step);
  }

  bind(found_method);

  if (return_method) {
    // Got a hit.
    // scan_temp = byte offset of the interface's method table within the klass.
    movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset()));
    // recv_klass was pre-biased by itable_index above.
    movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
  }
}
3780
// Look up the method for a megamorphic invokeinterface call in a single pass over itable:
// - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
// - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
// The target method is determined by <holder_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
                                                  Register holder_klass,
                                                  Register resolved_klass,
                                                  Register method_result,
                                                  Register scan_temp,
                                                  Register temp_reg2,
                                                  Register receiver,
                                                  int itable_index,
                                                  Label& L_no_such_interface) {
  assert_different_registers(recv_klass, method_result, holder_klass, resolved_klass, scan_temp, temp_reg2, receiver);
  // method_result doubles as the itable-entry cursor until the final load.
  Register temp_itbl_klass = method_result;
  Register temp_reg = (temp_reg2 == noreg ? recv_klass : temp_reg2); // reuse recv_klass register on 32-bit x86 impl

  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = in_bytes(itableMethodEntry::method_offset());
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  int ioffset = in_bytes(itableOffsetEntry::interface_offset());
  int ooffset = in_bytes(itableOffsetEntry::offset_offset());
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "adjust times_vte_scale");

  Label L_loop_scan_resolved_entry, L_resolved_found, L_holder_found;

  // temp_itbl_klass = recv_klass.itable[0]
  // scan_temp = &recv_klass.itable[0] + step
  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
  movptr(temp_itbl_klass, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset));
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset + scan_step));
  // temp_reg == 0 means "holder not yet found" during the resolved scan.
  xorptr(temp_reg, temp_reg);

  // Initial checks:
  //   - if (holder_klass != resolved_klass), go to "scan for resolved"
  //   - if (itable[0] == 0), no such interface
  //   - if (itable[0] == holder_klass), shortcut to "holder found"
  cmpptr(holder_klass, resolved_klass);
  jccb(Assembler::notEqual, L_loop_scan_resolved_entry);
  testptr(temp_itbl_klass, temp_itbl_klass);
  jccb(Assembler::zero, L_no_such_interface);
  cmpptr(holder_klass, temp_itbl_klass);
  jccb(Assembler::equal, L_holder_found);

  // Loop: Look for holder_klass record in itable
  //   do {
  //     tmp = itable[index];
  //     index += step;
  //     if (tmp == holder_klass) {
  //       goto L_holder_found; // Found!
  //     }
  //   } while (tmp != 0);
  //   goto L_no_such_interface // Not found.
  Label L_scan_holder;
  bind(L_scan_holder);
  movptr(temp_itbl_klass, Address(scan_temp, 0));
  addptr(scan_temp, scan_step);
  cmpptr(holder_klass, temp_itbl_klass);
  jccb(Assembler::equal, L_holder_found);
  testptr(temp_itbl_klass, temp_itbl_klass);
  jccb(Assembler::notZero, L_scan_holder);

  jmpb(L_no_such_interface);

  // Loop: Look for resolved_class record in itable
  //   do {
  //     tmp = itable[index];
  //     index += step;
  //     if (tmp == holder_klass) {
  //        // Also check if we have met a holder klass
  //        holder_tmp = itable[index-step-ioffset];
  //     }
  //     if (tmp == resolved_klass) {
  //        goto L_resolved_found;  // Found!
  //     }
  //   } while (tmp != 0);
  //   goto L_no_such_interface // Not found.
  //
  Label L_loop_scan_resolved;
  bind(L_loop_scan_resolved);
  movptr(temp_itbl_klass, Address(scan_temp, 0));
  addptr(scan_temp, scan_step);
  bind(L_loop_scan_resolved_entry);
  cmpptr(holder_klass, temp_itbl_klass);
  // Conditionally record the holder's offset without branching.
  cmovl(Assembler::equal, temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
  cmpptr(resolved_klass, temp_itbl_klass);
  jccb(Assembler::equal, L_resolved_found);
  testptr(temp_itbl_klass, temp_itbl_klass);
  jccb(Assembler::notZero, L_loop_scan_resolved);

  jmpb(L_no_such_interface);

  Label L_ready;

  // See if we already have a holder klass. If not, go and scan for it.
  bind(L_resolved_found);
  testptr(temp_reg, temp_reg);
  jccb(Assembler::zero, L_scan_holder);
  jmpb(L_ready);

  bind(L_holder_found);
  movl(temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));

  // Finally, temp_reg contains holder_klass vtable offset
  bind(L_ready);
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  if (temp_reg2 == noreg) { // recv_klass register is clobbered for 32-bit x86 impl
    // Re-derive the klass from the receiver since recv_klass was reused as temp.
    load_klass(scan_temp, receiver, noreg);
    movptr(method_result, Address(scan_temp, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
  } else {
    movptr(method_result, Address(recv_klass, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
  }
}
3899
3900
3901 // virtual method calling
3902 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3903 RegisterOrConstant vtable_index,
3904 Register method_result) {
3905 const ByteSize base = Klass::vtable_start_offset();
3906 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
3907 Address vtable_entry_addr(recv_klass,
3908 vtable_index, Address::times_ptr,
3909 base + vtableEntry::method_offset());
3910 movptr(method_result, vtable_entry_addr);
3911 }
3912

// Emits a full subtype check: branches to L_success if sub_klass is a
// subtype of super_klass, otherwise falls through. Combines the fast
// path (supertype display / cache) with the slow path (secondary
// supers scan). temp_reg is clobbered.
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, nullptr);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, nullptr);
  // Failure falls through to the code following this call.
  bind(L_failure);
}
3923


// Fast-path subtype check. Tests the trivial sub == super case, then the
// supertype display entry at super_check_offset. Depending on whether
// super_check_offset names the secondary-super-cache slot, a miss either
// fails outright or is routed to L_slow_path. Any of L_success / L_failure
// / L_slow_path may be nullptr (at most one), meaning "fall through".
// temp_reg is clobbered only when super_check_offset must be loaded.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  // -1 is the "not supplied" sentinel: the offset must be loaded from super_klass.
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb. If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label)                                \
  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
  else                             jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            jmp(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface. Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    // Offset known only at runtime: a miss may still be a secondary super
    // if the offset addressed the cache slot.
    local_jcc(Assembler::equal, *L_success);
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}
4026


// Slow-path subtype check: linear scan of sub_klass's secondary-supers
// array using repne_scan (which requires rax/rcx/rdi, spilled here if
// needed). On a hit the super is written into the secondary-super cache.
// Either of L_success / L_failure may be nullptr (at most one), meaning
// "fall through".
void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
                                                          Register super_klass,
                                                          Register temp_reg,
                                                          Register temp2_reg,
                                                          Label* L_success,
                                                          Label* L_failure,
                                                          bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)

  // Get super_klass value into rax (even if it was in rdi or rcx).
  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  if (super_klass != rax) {
    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
    mov(rax, super_klass);
  }
  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }

#ifndef PRODUCT
  // Bump the partial-subtype-check counter (diagnostics only).
  uint* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  lea(rcx, pst_counter_addr);
  incrementl(Address(rcx, 0));
#endif //PRODUCT

  // We will consult the secondary-super array.
  movptr(rdi, secondary_supers_addr);
  // Load the array length. (Positive movl does right thing on LP64.)
  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  addptr(rdi, Array<Klass*>::base_offset_in_bytes());

  // Scan RCX words at [RDI] for an occurrence of RAX.
  // Set NZ/Z based on last compare.
  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
  // not change flags (only scas instruction which is repeated sets flags).
  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.

  testptr(rax,rax); // Set Z = 0
  repne_scan();

  // Unspill the temp. registers:
  if (pushed_rdi)  pop(rdi);
  if (pushed_rcx)  pop(rcx);
  if (pushed_rax)  pop(rax);

  if (set_cond_codes) {
    // Special hack for the AD files:  rdi is guaranteed non-zero.
    assert(!pushed_rdi, "rdi must be left non-null");
    // Also, the condition codes are properly set Z/NZ on succeed/failure.
  }

  if (L_failure == &L_fallthrough)
        jccb(Assembler::notEqual, *L_failure);
  else  jcc(Assembler::notEqual, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  movptr(super_cache_addr, super_klass);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
4118
// Two-temp convenience wrapper: forwards to the six-register slow path
// with no extra temps. set_cond_codes must be false here (it is only
// meaningful for the 32-bit linear-scan variant).
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert(set_cond_codes == false, "must be false on 64-bit x86");
  check_klass_subtype_slow_path
    (sub_klass, super_klass, temp_reg, temp2_reg, noreg, noreg,
     L_success, L_failure);
}
4131
// Slow-path dispatch: selects the hashed secondary-supers-table lookup
// when UseSecondarySupersTable is enabled, otherwise the classic linear
// scan of the secondary supers array.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Register temp3_reg,
                                                   Register temp4_reg,
                                                   Label* L_success,
                                                   Label* L_failure) {
  if (UseSecondarySupersTable) {
    check_klass_subtype_slow_path_table
      (sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, temp4_reg,
       L_success, L_failure);
  } else {
    check_klass_subtype_slow_path_linear
      (sub_klass, super_klass, temp_reg, temp2_reg, L_success, L_failure, /*set_cond_codes*/false);
  }
}
4149
4150 Register MacroAssembler::allocate_if_noreg(Register r,
4151 RegSetIterator<Register> &available_regs,
4152 RegSet ®s_to_push) {
4153 if (!r->is_valid()) {
4154 r = *available_regs++;
4155 regs_to_push += r;
4156 }
4157 return r;
4158 }
4159
// Slow-path subtype check via the hashed secondary-supers table.
// Allocates any missing temps from a pool of caller-saved registers
// (spilling/restoring them around the lookup), performs the variable-slot
// table lookup, and branches on its 0 (success) / nonzero (failure) result.
// Either of L_success / L_failure may be nullptr (at most one), meaning
// "fall through".
void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
                                                         Register super_klass,
                                                         Register temp_reg,
                                                         Register temp2_reg,
                                                         Register temp3_reg,
                                                         Register result_reg,
                                                         Label* L_success,
                                                         Label* L_failure) {
  // NB! Callers may assume that, when temp2_reg is a valid register,
  // this code sets it to a nonzero value.
  bool temp2_reg_was_valid = temp2_reg->is_valid();

  RegSet temps = RegSet::of(temp_reg, temp2_reg, temp3_reg);

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  BLOCK_COMMENT("check_klass_subtype_slow_path_table");

  // Caller-saved pool from which missing temps are drawn; the inputs and
  // supplied temps are excluded.
  RegSetIterator<Register> available_regs
    = (RegSet::of(rax, rcx, rdx, r8) + r9 + r10 + r11 + r12 - temps - sub_klass - super_klass).begin();

  RegSet pushed_regs;

  temp_reg = allocate_if_noreg(temp_reg, available_regs, pushed_regs);
  temp2_reg = allocate_if_noreg(temp2_reg, available_regs, pushed_regs);
  temp3_reg = allocate_if_noreg(temp3_reg, available_regs, pushed_regs);
  result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
  Register temp4_reg = allocate_if_noreg(noreg, available_regs, pushed_regs);

  assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, result_reg);

  {

    int register_push_size = pushed_regs.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
    int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
    subptr(rsp, aligned_size);
    push_set(pushed_regs, 0);

    lookup_secondary_supers_table_var(sub_klass,
                                      super_klass,
                                      temp_reg, temp2_reg, temp3_reg, temp4_reg, result_reg);
    // Set flags from the lookup result before the registers are restored.
    cmpq(result_reg, 0);

    // Unspill the temp. registers:
    pop_set(pushed_regs, 0);
    // Increment SP but do not clobber flags.
    lea(rsp, Address(rsp, aligned_size));
  }

  // Honor the documented guarantee that temp2_reg ends up nonzero
  // (movq does not affect the flags set by cmpq above).
  if (temp2_reg_was_valid) {
    movq(temp2_reg, 1);
  }

  jcc(Assembler::notEqual, *L_failure);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

  bind(L_fallthrough);
}
4225
4226 // population_count variant for running without the POPCNT
4227 // instruction, which was introduced with SSE4.2 in 2008.
4228 void MacroAssembler::population_count(Register dst, Register src,
4229 Register scratch1, Register scratch2) {
4230 assert_different_registers(src, scratch1, scratch2);
4231 if (UsePopCountInstruction) {
4232 Assembler::popcntq(dst, src);
4233 } else {
4234 assert_different_registers(src, scratch1, scratch2);
4235 assert_different_registers(dst, scratch1, scratch2);
4236 Label loop, done;
4237
4238 mov(scratch1, src);
4239 // dst = 0;
4240 // while(scratch1 != 0) {
4241 // dst++;
4242 // scratch1 &= (scratch1 - 1);
4243 // }
4244 xorl(dst, dst);
4245 testq(scratch1, scratch1);
4246 jccb(Assembler::equal, done);
4247 {
4248 bind(loop);
4249 incq(dst);
4250 movq(scratch2, scratch1);
4251 decq(scratch2);
4252 andq(scratch1, scratch2);
4253 jccb(Assembler::notEqual, loop);
4254 }
4255 bind(done);
4256 }
4257 #ifdef ASSERT
4258 mov64(scratch1, 0xCafeBabeDeadBeef);
4259 movq(scratch2, scratch1);
4260 #endif
4261 }
4262
// Ensure that the inline code and the stub are using the same registers.
// This fixed assignment is shared with the slow-path stub (see the call to
// StubRoutines::lookup_secondary_supers_table_slow_path_stub() below), so
// any change here must be mirrored in the stub generator.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS                      \
do {                                                                 \
  assert(r_super_klass  == rax, "mismatch");                         \
  assert(r_array_base   == rbx, "mismatch");                         \
  assert(r_array_length == rcx, "mismatch");                         \
  assert(r_array_index  == rdx, "mismatch");                         \
  assert(r_sub_klass    == rsi || r_sub_klass == noreg, "mismatch"); \
  assert(r_bitmap       == r11 || r_bitmap    == noreg, "mismatch"); \
  assert(result         == rdi || result      == noreg, "mismatch"); \
} while(0)
4274
4275 // Versions of salq and rorq that don't need count to be in rcx
4276
4277 void MacroAssembler::salq(Register dest, Register count) {
4278 if (count == rcx) {
4279 Assembler::salq(dest);
4280 } else {
4281 assert_different_registers(rcx, dest);
4282 xchgq(rcx, count);
4283 Assembler::salq(dest);
4284 xchgq(rcx, count);
4285 }
4286 }
4287
4288 void MacroAssembler::rorq(Register dest, Register count) {
4289 if (count == rcx) {
4290 Assembler::rorq(dest);
4291 } else {
4292 assert_different_registers(rcx, dest);
4293 xchgq(rcx, count);
4294 Assembler::rorq(dest);
4295 xchgq(rcx, count);
4296 }
4297 }
4298
// Return true: we succeeded in generating this code
//
// At runtime, return 0 in result if r_super_klass is a superclass of
// r_sub_klass, otherwise return nonzero. Use this if you know the
// super_klass_slot of the class you're looking for. This is always
// the case for instanceof and checkcast.
void MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
                                                         Register r_super_klass,
                                                         Register temp1,
                                                         Register temp2,
                                                         Register temp3,
                                                         Register temp4,
                                                         Register result,
                                                         u1 super_klass_slot) {
  assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);

  Label L_fallthrough, L_success, L_failure;

  BLOCK_COMMENT("lookup_secondary_supers_table {");

  const Register
    r_array_index  = temp1,
    r_array_length = temp2,
    r_array_base   = temp3,
    r_bitmap       = temp4;

  // Register roles are fixed to match the slow-path stub.
  LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;

  xorq(result, result); // = 0

  movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
  movq(r_array_index, r_bitmap);

  // First check the bitmap to see if super_klass might be present. If
  // the bit is zero, we are certain that super_klass is not one of
  // the secondary supers.
  u1 bit = super_klass_slot;
  {
    // NB: If the count in a x86 shift instruction is 0, the flags are
    // not affected, so we do a testq instead.
    // Shift so that bit `bit` lands in the sign position.
    int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
    if (shift_count != 0) {
      salq(r_array_index, shift_count);
    } else {
      testq(r_array_index, r_array_index);
    }
  }
  // We test the MSB of r_array_index, i.e. its sign bit
  jcc(Assembler::positive, L_failure);

  // Get the first array index that can contain super_klass into r_array_index.
  if (bit != 0) {
    population_count(r_array_index, r_array_index, temp2, temp3);
  } else {
    // For slot 0 the popcount of the single shifted-in bit is always 1.
    movl(r_array_index, 1);
  }
  // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.

  // We will consult the secondary-super array.
  movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));

  // We're asserting that the first word in an Array<Klass*> is the
  // length, and the second word is the first word of the data. If
  // that ever changes, r_array_base will have to be adjusted here.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
  assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");

  cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
  jccb(Assembler::equal, L_success);

  // Is there another entry to check? Consult the bitmap.
  btq(r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
  jccb(Assembler::carryClear, L_failure);

  // Linear probe. Rotate the bitmap so that the next bit to test is
  // in Bit 1.
  if (bit != 0) {
    rorq(r_bitmap, bit);
  }

  // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
  // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
  // Kills: r_array_length.
  // Returns: result.
  call(RuntimeAddress(StubRoutines::lookup_secondary_supers_table_slow_path_stub()));
  // Result (0/1) is in rdi
  jmpb(L_fallthrough);

  bind(L_failure);
  incq(result); // 0 => 1

  bind(L_success);
  // result = 0;

  bind(L_fallthrough);
  BLOCK_COMMENT("} lookup_secondary_supers_table");

  if (VerifySecondarySupers) {
    verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
                                  temp1, temp2, temp3);
  }
}
4401
// At runtime, return 0 in result if r_super_klass is a superclass of
// r_sub_klass, otherwise return nonzero. Use this version of
// lookup_secondary_supers_table() if you don't know ahead of time
// which superclass will be searched for. Used by interpreter and
// runtime stubs. It is larger and has somewhat greater latency than
// the version above, which takes a constant super_klass_slot.
void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
                                                       Register r_super_klass,
                                                       Register temp1,
                                                       Register temp2,
                                                       Register temp3,
                                                       Register temp4,
                                                       Register result) {
  assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
  assert_different_registers(r_sub_klass, r_super_klass, rcx);
  RegSet temps = RegSet::of(temp1, temp2, temp3, temp4);

  Label L_fallthrough, L_success, L_failure;

  BLOCK_COMMENT("lookup_secondary_supers_table {");

  RegSetIterator<Register> available_regs = (temps - rcx).begin();

  // FIXME. Once we are sure that all paths reaching this point really
  // do pass rcx as one of our temps we can get rid of the following
  // workaround.
  assert(temps.contains(rcx), "fix this code");

  // We prefer to have our shift count in rcx. If rcx is one of our
  // temps, use it for slot. If not, pick any of our temps.
  // (Given the assert above, the else branch is the one actually taken.)
  Register slot;
  if (!temps.contains(rcx)) {
    slot = *available_regs++;
  } else {
    slot = rcx;
  }

  const Register r_array_index = *available_regs++;
  const Register r_bitmap      = *available_regs++;

  // The logic above guarantees this property, but we state it here.
  assert_different_registers(r_array_index, r_bitmap, rcx);

  movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
  movq(r_array_index, r_bitmap);

  // First check the bitmap to see if super_klass might be present. If
  // the bit is zero, we are certain that super_klass is not one of
  // the secondary supers.
  movb(slot, Address(r_super_klass, Klass::hash_slot_offset()));
  xorl(slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1)); // slot ^ 63 === 63 - slot (mod 64)
  // Shift so that the slot's bit lands in the sign position.
  salq(r_array_index, slot);

  testq(r_array_index, r_array_index);
  // We test the MSB of r_array_index, i.e. its sign bit
  jcc(Assembler::positive, L_failure);

  const Register r_array_base = *available_regs++;

  // Get the first array index that can contain super_klass into r_array_index.
  // Note: Clobbers r_array_base and slot.
  population_count(r_array_index, r_array_index, /*temp2*/r_array_base, /*temp3*/slot);

  // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.

  // We will consult the secondary-super array.
  movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));

  // We're asserting that the first word in an Array<Klass*> is the
  // length, and the second word is the first word of the data. If
  // that ever changes, r_array_base will have to be adjusted here.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
  assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");

  cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
  jccb(Assembler::equal, L_success);

  // Restore slot to its true value (population_count clobbered it).
  movb(slot, Address(r_super_klass, Klass::hash_slot_offset()));

  // Linear probe. Rotate the bitmap so that the next bit to test is
  // in Bit 1.
  rorq(r_bitmap, slot);

  // Is there another entry to check? Consult the bitmap.
  btq(r_bitmap, 1);
  jccb(Assembler::carryClear, L_failure);

  // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
  // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
  // Kills: r_array_length.
  // Returns: result.
  lookup_secondary_supers_table_slow_path(r_super_klass,
                                          r_array_base,
                                          r_array_index,
                                          r_bitmap,
                                          /*temp1*/result,
                                          /*temp2*/slot,
                                          &L_success,
                                          nullptr);

  bind(L_failure);
  movq(result, 1);
  jmpb(L_fallthrough);

  bind(L_success);
  xorq(result, result); // = 0

  bind(L_fallthrough);
  BLOCK_COMMENT("} lookup_secondary_supers_table");

  if (VerifySecondarySupers) {
    verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
                                  temp1, temp2, temp3);
  }
}
4518
// Emulated "repne scasq": linearly scans the 64-bit words at
// addr[count], addr[count+1], ..., addr[limit-1], comparing each against
// 'value'. Branches to *L_success on the first match, to *L_failure when
// the range is exhausted without one. At most one of the two labels may
// be null, in which case that outcome falls through instead.
// Clobbers 'count' and the condition flags.
void MacroAssembler::repne_scanq(Register addr, Register value, Register count, Register limit,
                                 Label* L_success, Label* L_failure) {
  Label L_loop, L_fallthrough;
  {
    int label_nulls = 0;
    if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
    if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
    assert(label_nulls <= 1, "at most one null in the batch");
  }
  bind(L_loop);
  // Compare 'value' against the qword at addr + count*8.
  cmpq(value, Address(addr, count, Address::times_8));
  jcc(Assembler::equal, *L_success);
  addl(count, 1);
  cmpl(count, limit);
  jcc(Assembler::less, L_loop);

  // Range exhausted with no match.
  if (&L_fallthrough != L_failure) {
    jmp(*L_failure);
  }
  bind(L_fallthrough);
}
4540
// Called by code generated by check_klass_subtype_slow_path
// above. This is called when there is a collision in the hashed
// lookup in the secondary supers array.
//
// Register contract established by the caller (lookup_secondary_supers_table):
//  - r_super_klass: the klass being searched for
//  - r_array_base:  secondary supers array, still pointing at the length
//                   word (adjusted to point at the data below)
//  - r_array_index: next slot to probe; off by 1, which the base
//                   adjustment below compensates for
//  - r_bitmap:      the secondary supers bitmap, rotated so that the next
//                   bit to consult is Bit 2
//  - temp1, temp2:  scratch
// Branches to *L_success / *L_failure; either (but not both) may be null,
// meaning fall through.
void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
                                                             Register r_array_base,
                                                             Register r_array_index,
                                                             Register r_bitmap,
                                                             Register temp1,
                                                             Register temp2,
                                                             Label* L_success,
                                                             Label* L_failure) {
  assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, temp2);

  const Register
    r_array_length = temp1,
    r_sub_klass = noreg,
    result = noreg;

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  // Load the array length.
  movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
  // And adjust the array base to point to the data.
  // NB! Effectively increments current slot index by 1.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
  addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());

  // Linear probe
  Label L_huge;

  // The bitmap is full to bursting.
  // Implicit invariant: BITMAP_FULL implies (length > 0)
  cmpl(r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
  jcc(Assembler::greater, L_huge);

  // NB! Our caller has checked bits 0 and 1 in the bitmap. The
  // current slot (at secondary_supers[r_array_index]) has not yet
  // been inspected, and r_array_index may be out of bounds if we
  // wrapped around the end of the array.

  { // This is conventional linear probing, but instead of terminating
    // when a null entry is found in the table, we maintain a bitmap
    // in which a 0 indicates missing entries.
    // The check above guarantees there are 0s in the bitmap, so the loop
    // eventually terminates.

    xorl(temp2, temp2); // = 0;

    Label L_again;
    bind(L_again);

    // Check for array wraparound.
    cmpl(r_array_index, r_array_length);
    cmovl(Assembler::greaterEqual, r_array_index, temp2);

    cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
    jcc(Assembler::equal, *L_success);

    // If the next bit in bitmap is zero, we're done.
    btq(r_bitmap, 2); // look-ahead check (Bit 2); Bits 0 and 1 are tested by now
    jcc(Assembler::carryClear, *L_failure);

    rorq(r_bitmap, 1); // Bits 1/2 => 0/1
    addl(r_array_index, 1);

    jmp(L_again);
  }

  { // Degenerate case: more than 64 secondary supers.
    // FIXME: We could do something smarter here, maybe a vectorized
    // comparison or a binary search, but is that worth any added
    // complexity?
    bind(L_huge);
    // Fall back to a plain linear scan of the whole array.
    xorl(r_array_index, r_array_index); // = 0
    repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length,
                L_success,
                (&L_fallthrough != L_failure ? L_failure : nullptr));

    bind(L_fallthrough);
  }
}
4626
// Argument record built on the stack by verify_secondary_supers_table and
// passed by pointer to verify_secondary_supers_table_helper, sidestepping
// calling-convention issues in the generated code.
struct VerifyHelperArguments {
  Klass* _super;            // super klass that was searched for
  Klass* _sub;              // sub klass whose secondary supers were searched
  intptr_t _linear_result;  // linear-scan outcome: 0 = found, 1 = not found
  intptr_t _table_result;   // hashed-lookup outcome, same encoding
};
4633
// Out-of-line C helper invoked from generated code when the hashed lookup
// and the linear scan disagree; forwards the details to
// Klass::on_secondary_supers_verification_failure for reporting.
static void verify_secondary_supers_table_helper(const char* msg, VerifyHelperArguments* args) {
  Klass::on_secondary_supers_verification_failure(args->_super,
                                                  args->_sub,
                                                  args->_linear_result,
                                                  args->_table_result,
                                                  msg);
}
4641
// Make sure that the hashed lookup and a linear scan agree.
//
// 'result' holds the hashed-lookup outcome (0 = found, 1 = not found).
// This routine redoes the search with a linear scan over the secondary
// supers array and, on any disagreement, builds a VerifyHelperArguments
// record on the stack and calls verify_secondary_supers_table_helper.
// Clobbers temp1-temp3.
void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
                                                   Register r_super_klass,
                                                   Register result,
                                                   Register temp1,
                                                   Register temp2,
                                                   Register temp3) {
  const Register
    r_array_index = temp1,
    r_array_length = temp2,
    r_array_base = temp3,
    r_bitmap = noreg;

  BLOCK_COMMENT("verify_secondary_supers_table {");

  Label L_success, L_failure, L_check, L_done;

  movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
  movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
  // And adjust the array base to point to the data.
  addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());

  // An empty array can never contain the super klass.
  testl(r_array_length, r_array_length); // array_length == 0?
  jcc(Assembler::zero, L_failure);

  movl(r_array_index, 0);
  repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length, &L_success);
  // fall through to L_failure

  const Register linear_result = r_array_index; // reuse temp1

  bind(L_failure); // not present
  movl(linear_result, 1);
  jmp(L_check);

  bind(L_success); // present
  movl(linear_result, 0);

  bind(L_check);
  // Compare the linear-scan result against the hashed-lookup result.
  cmpl(linear_result, result);
  jcc(Assembler::equal, L_done);

  { // To avoid calling convention issues, build a record on the stack
    // and pass the pointer to that instead.
    // Pushed in reverse field order so the stack matches VerifyHelperArguments.
    push(result);
    push(linear_result);
    push(r_sub_klass);
    push(r_super_klass);
    movptr(c_rarg1, rsp);
    movptr(c_rarg0, (uintptr_t) "mismatch");
    call(RuntimeAddress(CAST_FROM_FN_PTR(address, verify_secondary_supers_table_helper)));
    should_not_reach_here();
  }
  bind(L_done);

  BLOCK_COMMENT("} verify_secondary_supers_table");
}
4699
4700 #undef LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS
4701
// Emits the class-initialization barrier for 'klass'.
// Takes the fast path when the class is fully initialized or when the
// current thread is the initializing thread; otherwise takes the slow
// path. Exactly one of the two labels must be null — the null one is
// replaced by fall-through; passing both labels is not implemented.
void MacroAssembler::clinit_barrier(Register klass, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");

  Label L_fallthrough;
  if (L_fast_path == nullptr) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == nullptr) {
    L_slow_path = &L_fallthrough;
  }

  // Fast path check: class is fully initialized.
  // init_state needs acquire, but x86 is TSO, and so we are already good.
  cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
  jcc(Assembler::equal, *L_fast_path);

  // Fast path check: current thread is initializer thread
  cmpptr(r15_thread, Address(klass, InstanceKlass::init_thread_offset()));
  if (L_slow_path == &L_fallthrough) {
    jcc(Assembler::equal, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    jcc(Assembler::notEqual, *L_slow_path);
    bind(*L_fast_path);
  } else {
    Unimplemented();
  }
}
4729
// Conditional 32-bit move from memory: dst = src if condition 'cc' holds.
// On CPUs without CMOV, emulated with a short branch around a plain move.
void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
  if (VM_Version::supports_cmov()) {
    cmovl(cc, dst, src);
  } else {
    Label L;
    jccb(negate_condition(cc), L);
    movl(dst, src);
    bind(L);
  }
}
4740
// Conditional 32-bit register move: dst = src if condition 'cc' holds.
// On CPUs without CMOV, emulated with a short branch around a plain move.
void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
  if (VM_Version::supports_cmov()) {
    cmovl(cc, dst, src);
  } else {
    Label L;
    jccb(negate_condition(cc), L);
    movl(dst, src);
    bind(L);
  }
}
4751
// Emits a runtime check (only under -XX:+VerifyOops) that 'reg' holds a
// valid oop. Pushes the oop and a diagnostic message string, then invokes
// the shared verify-oop subroutine via an indirect call; the subroutine
// pops the arguments and restores the saved registers.
void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
  if (!VerifyOops) return;

  BLOCK_COMMENT("verify_oop {");
  push(rscratch1);
  push(rax); // save rax
  push(reg); // pass register argument

  // Pass register number to verify_oop_subroutine
  const char* b = nullptr;
  {
    // Build the message in the code-string area so it outlives this call.
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
    b = code_string(ss.as_string());
  }
  AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
  pushptr(buffer.addr(), rscratch1);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (oop, message) and restores rax, r10
  BLOCK_COMMENT("} verify_oop");
}
4777
// Fills vector register 'dst' with all-ones (every bit set), choosing the
// best instruction for the available ISA level.
void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
  if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
    // Only pcmpeq has dependency breaking treatment (i.e the execution can begin without
    // waiting for the previous result on dst), not vpcmpeqd, so just use vpternlog
    vpternlogd(dst, 0xFF, dst, dst, vector_len);
  } else if (VM_Version::supports_avx()) {
    // Comparing a register with itself for equality yields all-ones lanes.
    vpcmpeqd(dst, dst, dst, vector_len);
  } else {
    pcmpeqd(dst, dst);
  }
}
4789
// Returns the rsp-relative address of interpreter argument slot
// 'arg_slot' (in stack elements), with 'extra_slot_offset' additional
// slots folded in. Handles both constant and register-held slot numbers
// and accounts for the return PC sitting on the stack.
Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  // Check that expr_offset_in_bytes is linear in the slot number.
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  Register scale_reg = noreg;
  Address::ScaleFactor scale_factor = Address::no_scale;
  if (arg_slot.is_constant()) {
    offset += arg_slot.as_constant() * stackElementSize;
  } else {
    // Slot number is in a register: fold it in via a scaled index.
    scale_reg = arg_slot.as_register();
    scale_factor = Address::times(stackElementSize);
  }
  offset += wordSize; // return PC is on stack
  return Address(rsp, scale_reg, scale_factor, offset);
}
4810
// Handle the receiver type profile update given the "recv" klass.
//
// Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
// If there are no matching or claimable receiver entries in RD, updates
// the polymorphic counter.
//
// This code expected to run by either the interpreter or JIT-ed code, without
// extra synchronization. For safety, receiver cells are claimed atomically, which
// avoids grossly misrepresenting the profiles under concurrent updates. For speed,
// counter updates are not atomic.
//
void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset) {
  // Raw byte offsets into the MDO: first receiver slot, one-past-last
  // receiver slot, the polymorphic counter, the stride between consecutive
  // receiver slots, and the delta from a receiver slot to its count slot.
  int base_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(0));
  int end_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit()));
  int poly_count_offset = in_bytes(CounterData::count_offset());
  int receiver_step = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
  int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;

  // Adjust for MDP offsets. Slots are pointer-sized, so is the global offset.
  assert(is_aligned(mdp_offset, BytesPerWord), "sanity");
  base_receiver_offset += mdp_offset;
  end_receiver_offset += mdp_offset;
  poly_count_offset += mdp_offset;

  // Scale down to optimize encoding. Slots are pointer-sized.
  assert(is_aligned(base_receiver_offset, BytesPerWord), "sanity");
  assert(is_aligned(end_receiver_offset, BytesPerWord), "sanity");
  assert(is_aligned(poly_count_offset, BytesPerWord), "sanity");
  assert(is_aligned(receiver_step, BytesPerWord), "sanity");
  assert(is_aligned(receiver_to_count_step, BytesPerWord), "sanity");
  base_receiver_offset >>= LogBytesPerWord;
  end_receiver_offset >>= LogBytesPerWord;
  poly_count_offset >>= LogBytesPerWord;
  receiver_step >>= LogBytesPerWord;
  receiver_to_count_step >>= LogBytesPerWord;

#ifdef ASSERT
  // We are about to walk the MDO slots without asking for offsets.
  // Check that our math hits all the right spots.
  for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
    int real_recv_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
    int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
    int offset = base_receiver_offset + receiver_step*c;
    int count_offset = offset + receiver_to_count_step;
    assert((offset << LogBytesPerWord) == real_recv_offset, "receiver slot math");
    assert((count_offset << LogBytesPerWord) == real_count_offset, "receiver count math");
  }
  int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
  assert(poly_count_offset << LogBytesPerWord == real_poly_count_offset, "poly counter math");
#endif

  // Corner case: no profile table. Increment poly counter and exit.
  if (ReceiverTypeData::row_limit() == 0) {
    addptr(Address(mdp, poly_count_offset, Address::times_ptr), DataLayout::counter_increment);
    return;
  }

  // Scaled slot offset currently being inspected.
  Register offset = rscratch1;

  Label L_loop_search_receiver, L_loop_search_empty;
  Label L_restart, L_found_recv, L_found_empty, L_polymorphic, L_count_update;

  // The code here recognizes three major cases:
  //   A. Fastest: receiver found in the table
  //   B. Fast: no receiver in the table, and the table is full
  //   C. Slow: no receiver in the table, free slots in the table
  //
  // The case A performance is most important, as perfectly-behaved code would end up
  // there, especially with larger TypeProfileWidth. The case B performance is
  // important as well, this is where bulk of code would land for normally megamorphic
  // cases. The case C performance is not essential, its job is to deal with installation
  // races, we optimize for code density instead. Case C needs to make sure that receiver
  // rows are only claimed once. This makes sure we never overwrite a row for another
  // receiver and never duplicate the receivers in the list, making profile type-accurate.
  //
  // It is very tempting to handle these cases in a single loop, and claim the first slot
  // without checking the rest of the table. But, profiling code should tolerate free slots
  // in the table, as class unloading can clear them. After such cleanup, the receiver
  // we need might be _after_ the free slot. Therefore, we need to let at least full scan
  // to complete, before trying to install new slots. Splitting the code in several tight
  // loops also helpfully optimizes for cases A and B.
  //
  // This code is effectively:
  //
  //   restart:
  //     // Fastest: receiver is already installed
  //     for (i = 0; i < receiver_count(); i++) {
  //       if (receiver(i) == recv) goto found_recv(i);
  //     }
  //
  //     // Fast: no receiver, but profile is full
  //     for (i = 0; i < receiver_count(); i++) {
  //       if (receiver(i) == null) goto found_null(i);
  //     }
  //     goto polymorphic
  //
  //     // Slow: try to install receiver
  //     found_null(i):
  //       CAS(&receiver(i), null, recv);
  //       goto restart
  //
  //   polymorphic:
  //     count++;
  //     return
  //
  //   found_recv(i):
  //     *receiver_count(i)++
  //

  bind(L_restart);

  // Fastest: receiver is already installed
  movptr(offset, base_receiver_offset);
  bind(L_loop_search_receiver);
  cmpptr(recv, Address(mdp, offset, Address::times_ptr));
  jccb(Assembler::equal, L_found_recv);
  addptr(offset, receiver_step);
  cmpptr(offset, end_receiver_offset);
  jccb(Assembler::notEqual, L_loop_search_receiver);

  // Fast: no receiver, but profile is full
  movptr(offset, base_receiver_offset);
  bind(L_loop_search_empty);
  cmpptr(Address(mdp, offset, Address::times_ptr), NULL_WORD);
  jccb(Assembler::equal, L_found_empty);
  addptr(offset, receiver_step);
  cmpptr(offset, end_receiver_offset);
  jccb(Assembler::notEqual, L_loop_search_empty);
  jmpb(L_polymorphic);

  // Slow: try to install receiver
  bind(L_found_empty);

  // Atomically swing receiver slot: null -> recv.
  //
  // The update code uses CAS, which wants RAX register specifically, *and* it needs
  // other important registers untouched, as they form the address. Therefore, we need
  // to shift any important registers from RAX into some other spare register. If we
  // have a spare register, we are forced to save it on stack here.

  Register spare_reg = noreg;
  Register shifted_mdp = mdp;
  Register shifted_recv = recv;
  if (recv == rax || mdp == rax) {
    // Pick a spare register that is neither recv nor mdp.
    spare_reg = (recv != rbx && mdp != rbx) ? rbx :
                (recv != rcx && mdp != rcx) ? rcx :
                rdx;
    assert_different_registers(mdp, recv, offset, spare_reg);

    push(spare_reg);
    if (recv == rax) {
      movptr(spare_reg, recv);
      shifted_recv = spare_reg;
    } else {
      assert(mdp == rax, "Remaining case");
      movptr(spare_reg, mdp);
      shifted_mdp = spare_reg;
    }
  } else {
    // Neither recv nor mdp occupies RAX; just preserve RAX around the CAS.
    push(rax);
  }

  // None of the important registers are in RAX after this shuffle.
  assert_different_registers(rax, shifted_mdp, shifted_recv, offset);

  // CAS expects the compare value (null) in RAX.
  xorptr(rax, rax);
  cmpxchgptr(shifted_recv, Address(shifted_mdp, offset, Address::times_ptr));

  // Unshift registers.
  if (recv == rax || mdp == rax) {
    movptr(rax, spare_reg);
    pop(spare_reg);
  } else {
    pop(rax);
  }

  // CAS success means the slot now has the receiver we want. CAS failure means
  // something had claimed the slot concurrently: it can be the same receiver we want,
  // or something else. Since this is a slow path, we can optimize for code density,
  // and just restart the search from the beginning.
  jmpb(L_restart);

  // Counter updates:

  // Increment polymorphic counter instead of receiver slot.
  bind(L_polymorphic);
  movptr(offset, poly_count_offset);
  jmpb(L_count_update);

  // Found a receiver, convert its slot offset to corresponding count offset.
  bind(L_found_recv);
  addptr(offset, receiver_to_count_step);

  bind(L_count_update);
  addptr(Address(mdp, offset, Address::times_ptr), DataLayout::counter_increment);
}
5007
// Like _verify_oop, but verifies the oop stored at memory location 'addr'
// (only under -XX:+VerifyOops). Takes care to re-materialize an
// rsp-relative address after the two register pushes shift the stack.
void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
  if (!VerifyOops) return;

  push(rscratch1);
  push(rax); // save rax,
  // addr may contain rsp so we will have to adjust it based on the push
  // we just did (and on 64 bit we do two pushes)
  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
  // stores rax into addr which is backwards of what was intended.
  if (addr.uses(rsp)) {
    lea(rax, addr);
    pushptr(Address(rax, 2 * BytesPerWord));
  } else {
    pushptr(addr);
  }

  // Pass register number to verify_oop_subroutine
  const char* b = nullptr;
  {
    // Build the message in the code-string area so it outlives this call.
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
    b = code_string(ss.as_string());
  }
  AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
  pushptr(buffer.addr(), rscratch1);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (addr, message) and restores rax, r10.
}
5040
// Debug-only sanity check of the current thread's TLAB pointers:
// asserts start <= top <= end, stopping the VM on violation.
// Emits nothing unless both UseTLAB and VerifyOops are on.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;
    Register t1 = rsi;

    push(t1);

    // Check top >= start.
    movptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
    cmpptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_start_offset())));
    jcc(Assembler::aboveEqual, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    // Check end >= top.
    movptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_end_offset())));
    cmpptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
    jcc(Assembler::aboveEqual, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    pop(t1);
  }
#endif
}
5067
// Image of the FPU control word (part of the FPU_State dump below).
// Accessors extract the rounding-control and precision-control fields and
// the individual exception mask bits from the saved word.
class ControlWord {
 public:
  int32_t _value;

  int rounding_control() const { return (_value >> 10) & 3 ; }
  int precision_control() const { return (_value >> 8) & 3 ; }
  bool precision() const { return ((_value >> 5) & 1) != 0; }
  bool underflow() const { return ((_value >> 4) & 1) != 0; }
  bool overflow() const { return ((_value >> 3) & 1) != 0; }
  bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
  bool denormalized() const { return ((_value >> 1) & 1) != 0; }
  bool invalid() const { return ((_value >> 0) & 1) != 0; }

  void print() const {
    // rounding control
    const char* rc;
    switch (rounding_control()) {
      case 0: rc = "round near"; break;
      case 1: rc = "round down"; break;
      case 2: rc = "round up "; break;
      case 3: rc = "chop "; break;
      default:
        rc = nullptr; // silence compiler warnings
        fatal("Unknown rounding control: %d", rounding_control());
    };
    // precision control
    const char* pc;
    switch (precision_control()) {
      case 0: pc = "24 bits "; break;
      case 1: pc = "reserved"; break;
      case 2: pc = "53 bits "; break;
      case 3: pc = "64 bits "; break;
      default:
        pc = nullptr; // silence compiler warnings
        fatal("Unknown precision control: %d", precision_control());
    };
    // flags: uppercase letter = mask bit set, lowercase = clear
    char f[9];
    f[0] = ' ';
    f[1] = ' ';
    f[2] = (precision ()) ? 'P' : 'p';
    f[3] = (underflow ()) ? 'U' : 'u';
    f[4] = (overflow ()) ? 'O' : 'o';
    f[5] = (zero_divide ()) ? 'Z' : 'z';
    f[6] = (denormalized()) ? 'D' : 'd';
    f[7] = (invalid ()) ? 'I' : 'i';
    f[8] = '\x0';
    // output
    printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
  }

};
5120
// Image of the FPU status word (part of the FPU_State dump below).
// Accessors pick apart the busy flag, condition codes, TOP-of-stack
// field, and the exception status bits.
class StatusWord {
 public:
  int32_t _value;

  // Extracts the single bit at position 'pos' of the status word.
  bool bit_at(int pos) const { return ((_value >> pos) & 1) != 0; }

  bool busy() const         { return bit_at(15); }
  bool C3() const           { return bit_at(14); }
  bool C2() const           { return bit_at(10); }
  bool C1() const           { return bit_at(9); }
  bool C0() const           { return bit_at(8); }
  int  top() const          { return (_value >> 11) & 7; }
  bool error_status() const { return bit_at(7); }
  bool stack_fault() const  { return bit_at(6); }
  bool precision() const    { return bit_at(5); }
  bool underflow() const    { return bit_at(4); }
  bool overflow() const     { return bit_at(3); }
  bool zero_divide() const  { return bit_at(2); }
  bool denormalized() const { return bit_at(1); }
  bool invalid() const      { return bit_at(0); }

  void print() const {
    // Condition codes: the digit when set, '-' when clear.
    char cc[5];
    cc[0] = C3() ? '3' : '-';
    cc[1] = C2() ? '2' : '-';
    cc[2] = C1() ? '1' : '-';
    cc[3] = C0() ? '0' : '-';
    cc[4] = '\x0';
    // Exception/status flags: a letter when raised, '-' otherwise.
    char flags[9];
    flags[0] = error_status() ? 'E' : '-';
    flags[1] = stack_fault()  ? 'S' : '-';
    flags[2] = precision()    ? 'P' : '-';
    flags[3] = underflow()    ? 'U' : '-';
    flags[4] = overflow()     ? 'O' : '-';
    flags[5] = zero_divide()  ? 'Z' : '-';
    flags[6] = denormalized() ? 'D' : '-';
    flags[7] = invalid()      ? 'I' : '-';
    flags[8] = '\x0';
    printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, flags, cc, top());
  }

};
5164
// Image of the FPU tag word (part of the FPU_State dump below); holds a
// two-bit tag per FPU register.
class TagWord {
 public:
  int32_t _value;

  // Two-bit tag for register i (see FPU_State::tag_as_string for meanings).
  int tag_at(int i) const { return (_value >> (2 * i)) & 3; }

  void print() const {
    const int raw = _value & 0xFFFF;
    printf("%04x", raw);
  }

};
5176
// One saved FPU data register: 64-bit mantissa split into two 32-bit
// halves plus a 16-bit sign/exponent field.
class FPU_Register {
 public:
  int32_t _m0;  // low half of the mantissa
  int32_t _m1;  // high half of the mantissa
  int16_t _ex;  // sign and exponent

  // True iff this register holds the "indefinite" NaN bit pattern.
  bool is_indefinite() const {
    if (_ex != -1) {
      return false;
    }
    return _m1 == (int32_t)0xC0000000 && _m0 == 0;
  }

  void print() const {
    const char sign_char = (_ex < 0) ? '-' : '+';
    const bool looks_like_nan = (_ex == 0x7FFF) || (_ex == (int16_t)-1);
    printf("%c%04hx.%08x%08x %s", sign_char, _ex, _m1, _m0, looks_like_nan ? "NaN" : " ");
  };

};
5194
// Full FPU state image: control/status/tag words, error and data
// pointers, and the eight data registers, stored register_size bytes
// apart. NOTE(review): layout presumably matches the format produced by
// the CPU-state saving code — confirm against push_CPU_state.
class FPU_State {
 public:
  enum {
    register_size = 10,       // bytes occupied by one stored FPU register
    number_of_registers = 8,
    register_mask = 7         // wraps stack-relative register indices
  };

  ControlWord _control_word;
  StatusWord _status_word;
  TagWord _tag_word;
  int32_t _error_offset;
  int32_t _error_selector;
  int32_t _data_offset;
  int32_t _data_selector;
  int8_t _register[register_size * number_of_registers];

  // Tag of logical stack register ST(i), mapped through the TOP field.
  int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
  // Pointer to physical register i inside the raw register area.
  FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; }

  const char* tag_as_string(int tag) const {
    switch (tag) {
      case 0: return "valid";
      case 1: return "zero";
      case 2: return "special";
      case 3: return "empty";
    }
    ShouldNotReachHere();
    return nullptr;
  }

  void print() const {
    // print computation registers
    { int t = _status_word.top();
      for (int i = 0; i < number_of_registers; i++) {
        // j is the stack-relative (ST) number of physical register i.
        int j = (i - t) & register_mask;
        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
        st(j)->print();
        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
      }
    }
    printf("\n");
    // print control registers
    printf("ctrl = "); _control_word.print(); printf("\n");
    printf("stat = "); _status_word .print(); printf("\n");
    printf("tags = "); _tag_word .print(); printf("\n");
  }

};
5244
// Image of the saved EFLAGS register; accessors test the individual
// arithmetic/status flag bits.
class Flag_Register {
 public:
  int32_t _value;

  // Tests the single flag bit at position 'pos'.
  bool bit_at(int pos) const { return ((_value >> pos) & 1) != 0; }

  bool overflow() const        { return bit_at(11); }
  bool direction() const       { return bit_at(10); }
  bool sign() const            { return bit_at(7); }
  bool zero() const            { return bit_at(6); }
  bool auxiliary_carry() const { return bit_at(4); }
  bool parity() const          { return bit_at(2); }
  bool carry() const           { return bit_at(0); }

  void print() const {
    // One letter per set flag, '-' for clear flags.
    char flags[8];
    flags[0] = overflow()        ? 'O' : '-';
    flags[1] = direction()       ? 'D' : '-';
    flags[2] = sign()            ? 'S' : '-';
    flags[3] = zero()            ? 'Z' : '-';
    flags[4] = auxiliary_carry() ? 'A' : '-';
    flags[5] = parity()          ? 'P' : '-';
    flags[6] = carry()           ? 'C' : '-';
    flags[7] = '\x0';
    printf("%08x flags = %s", _value, flags);
  }

};
5273
// One saved general-purpose register value.
class IU_Register {
 public:
  int32_t _value;

  // Prints the value as zero-padded hex and as signed decimal.
  void print() const {
    printf("%08x %11d", _value, _value);
  }

};
5283
// Saved integer-unit state: eflags plus the general-purpose registers.
// NOTE(review): field order presumably mirrors the on-stack layout
// produced by the CPU-state save code — confirm against push_CPU_state.
class IU_State {
 public:
  Flag_Register _eflags;
  IU_Register _rdi;
  IU_Register _rsi;
  IU_Register _rbp;
  IU_Register _rsp;
  IU_Register _rbx;
  IU_Register _rdx;
  IU_Register _rcx;
  IU_Register _rax;

  void print() const {
    // computation registers
    printf("rax, = "); _rax.print(); printf("\n");
    printf("rbx, = "); _rbx.print(); printf("\n");
    printf("rcx = "); _rcx.print(); printf("\n");
    printf("rdx = "); _rdx.print(); printf("\n");
    printf("rdi = "); _rdi.print(); printf("\n");
    printf("rsi = "); _rsi.print(); printf("\n");
    printf("rbp, = "); _rbp.print(); printf("\n");
    printf("rsp = "); _rsp.print(); printf("\n");
    printf("\n");
    // control registers
    printf("flgs = "); _eflags.print(); printf("\n");
  }
};
5311
5312
// Aggregate of the FPU and integer-unit state images; printed by
// MacroAssembler::print_CPU_state() below.
class CPU_State {
 public:
  FPU_State _fpu_state;
  IU_State _iu_state;

  void print() const {
    printf("--------------------------------------------------\n");
    _iu_state .print();
    printf("\n");
    _fpu_state.print();
    printf("--------------------------------------------------\n");
  }

};
5327
5328
// C-calling-convention trampoline invoked from generated code by
// MacroAssembler::print_CPU_state().
static void _print_CPU_state(CPU_State* state) {
  state->print();
};
5332
5333
// Debugging aid: dumps the CPU state. Saves the full CPU state on the
// stack, passes its address (the post-save rsp) to the C helper
// _print_CPU_state, then restores everything.
void MacroAssembler::print_CPU_state() {
  push_CPU_state();
  push(rsp); // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize); // discard argument
  pop_CPU_state();
}
5341
// Re-establishes the expected FP control state after returning from a
// JNI call: reloads the standard MXCSR (or verifies it under
// -Xcheck:jni), then clears the upper YMM bits.
void MacroAssembler::restore_cpu_control_state_after_jni(Register rscratch) {
  // Either restore the MXCSR register after returning from the JNI Call
  // or verify that it wasn't changed (with -Xcheck:jni flag).
  if (VM_Version::supports_sse()) {
    if (RestoreMXCSROnJNICalls) {
      ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), rscratch);
    } else if (CheckJNICalls) {
      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
    }
  }
  // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
  vzeroupper();
}
5355
// ((OopHandle)result).resolve();
// Replaces the OopHandle in 'result' with the oop it refers to.
// 'tmp' is scratch for the GC-aware load; must differ from 'result'.
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  assert_different_registers(result, tmp);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // OopHandle::resolve is an indirection like jobject.
  access_load_at(T_OBJECT, IN_NATIVE,
                 result, Address(result, 0), tmp);
}
5366
5367 // ((WeakHandle)result).resolve();
// Replace the WeakHandle in 'rresult' with the oop it refers to.
// A null handle short-circuits to null without dereferencing; otherwise the
// load goes through the GC barrier with phantom-reference strength.
void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
  assert_different_registers(rresult, rtmp);
  Label resolved;

  // A null weak handle resolves to null.
  cmpptr(rresult, 0);
  jcc(Assembler::equal, resolved);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // WeakHandle::resolve is an indirection like jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                 rresult, Address(rresult, 0), rtmp);
  bind(resolved);
}
5383
// Load the java.lang.Class mirror of 'method's holder class into 'mirror':
// Method* -> holder InstanceKlass* -> mirror OopHandle -> resolved oop.
// 'tmp' is scratch for the OopHandle resolution barrier.
void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
  // get mirror
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  load_method_holder(mirror, method);
  movptr(mirror, Address(mirror, mirror_offset));
  resolve_oop_handle(mirror, tmp);
}
5391
// Load the ClassLoaderData* of 'rmethod's holder class into 'rresult'.
void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
  load_method_holder(rresult, rmethod);
  movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
}
5396
// Load the InstanceKlass* that declares 'method' into 'holder', following
// the chain Method* -> ConstMethod* -> ConstantPool* -> pool holder.
void MacroAssembler::load_method_holder(Register holder, Register method) {
  movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
  movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
  movptr(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
}
5402
// With compact object headers the narrow Klass* lives in the upper bits of
// the mark word: load the mark word of object 'src' and shift the narrow
// klass down into 'dst'. The result is still compressed (not decoded).
void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
  assert(UseCompactObjectHeaders, "expect compact object headers");
  movq(dst, Address(src, oopDesc::mark_offset_in_bytes()));
  shrq(dst, markWord::klass_shift);
}
5408
5409 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
5410 assert_different_registers(src, tmp);
5411 assert_different_registers(dst, tmp);
5412
5413 if (UseCompactObjectHeaders) {
5414 load_narrow_klass_compact(dst, src);
5415 decode_klass_not_null(dst, tmp);
5416 } else {
5417 movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5418 decode_klass_not_null(dst, tmp);
5419 }
5420 }
5421
// Store the Klass* in 'src' into the klass field of object 'dst'.
// NOTE: encodes 'src' in place, so 'src' is clobbered. Not used with
// compact headers, where the klass lives in the mark word instead.
void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
  assert(!UseCompactObjectHeaders, "not with compact headers");
  assert_different_registers(src, tmp);
  assert_different_registers(dst, tmp);
  encode_klass_not_null(src, tmp);
  movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}
5429
// Compare the narrow klass in register 'klass' against the klass of object
// 'obj', setting flags for a subsequent conditional branch. 'tmp' is only
// needed with compact headers (to hold the extracted narrow klass).
void MacroAssembler::cmp_klass(Register klass, Register obj, Register tmp) {
  if (UseCompactObjectHeaders) {
    assert(tmp != noreg, "need tmp");
    assert_different_registers(klass, obj, tmp);
    load_narrow_klass_compact(tmp, obj);
    cmpl(klass, tmp);
  } else {
    cmpl(klass, Address(obj, oopDesc::klass_offset_in_bytes()));
  }
}
5440
// Compare the klasses of two objects, setting flags for a subsequent
// conditional branch. Comparison is done on the narrow (compressed) klass
// values; tmp1 is always clobbered, tmp2 only with compact headers.
void MacroAssembler::cmp_klasses_from_objects(Register obj1, Register obj2, Register tmp1, Register tmp2) {
  if (UseCompactObjectHeaders) {
    assert(tmp2 != noreg, "need tmp2");
    assert_different_registers(obj1, obj2, tmp1, tmp2);
    load_narrow_klass_compact(tmp1, obj1);
    load_narrow_klass_compact(tmp2, obj2);
    cmpl(tmp1, tmp2);
  } else {
    movl(tmp1, Address(obj1, oopDesc::klass_offset_in_bytes()));
    cmpl(tmp1, Address(obj2, oopDesc::klass_offset_in_bytes()));
  }
}
5453
5454 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5455 Register tmp1) {
5456 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5457 decorators = AccessInternal::decorator_fixup(decorators, type);
5458 bool as_raw = (decorators & AS_RAW) != 0;
5459 if (as_raw) {
5460 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1);
5461 } else {
5462 bs->load_at(this, decorators, type, dst, src, tmp1);
5463 }
5464 }
5465
5466 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
5467 Register tmp1, Register tmp2, Register tmp3) {
5468 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5469 decorators = AccessInternal::decorator_fixup(decorators, type);
5470 bool as_raw = (decorators & AS_RAW) != 0;
5471 if (as_raw) {
5472 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
5473 } else {
5474 bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
5475 }
5476 }
5477
// Load an oop from the heap address 'src' into 'dst' with IN_HEAP semantics
// (plus any caller-supplied decorators), applying GC barriers as needed.
void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1);
}
5481
// Doesn't do verification, generates fixed size code.
// Like load_heap_oop, but the caller guarantees the loaded oop is non-null
// (IS_NOT_NULL lets the barrier/decode code skip its null handling).
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1);
}
5486
// Store the oop in 'val' to the heap address 'dst' with IN_HEAP semantics
// (plus any caller-supplied decorators), applying GC barriers as needed.
// tmp1-tmp3 are scratch registers for the barrier code.
void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
                                    Register tmp2, Register tmp3, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
}
5491
// Used for storing nulls: store a null oop to heap address 'dst'
// (noreg as the value register signals a null store to the barrier code).
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
}
5496
// Store 'src' (typically zero) into the 32-bit klass gap of object 'dst' —
// the padding that follows the narrow klass field in the header.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  assert(!UseCompactObjectHeaders, "Don't use with compact headers");
  // Store to klass gap in destination
  movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
}
5502
5503 #ifdef ASSERT
5504 void MacroAssembler::verify_heapbase(const char* msg) {
5505 assert (UseCompressedOops, "should be compressed");
5506 assert (Universe::heap() != nullptr, "java heap should be initialized");
5507 if (CheckCompressedOops) {
5508 Label ok;
5509 ExternalAddress src2(CompressedOops::base_addr());
5510 const bool is_src2_reachable = reachable(src2);
5511 if (!is_src2_reachable) {
5512 push(rscratch1); // cmpptr trashes rscratch1
5513 }
5514 cmpptr(r12_heapbase, src2, rscratch1);
5515 jcc(Assembler::equal, ok);
5516 STOP(msg);
5517 bind(ok);
5518 if (!is_src2_reachable) {
5519 pop(rscratch1);
5520 }
5521 }
5522 }
5523 #endif
5524
// Algorithm must match oop.inline.hpp encode_heap_oop.
// Compress the (possibly null) oop in 'r' in place into its narrow form.
void MacroAssembler::encode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop");
  if (CompressedOops::base() == nullptr) {
    // Zero-based compressed oops: encoding is just an (optional) shift.
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      shrq(r, LogMinObjAlignmentInBytes);
    }
    return;
  }
  // Heap-based mode: map a null oop to the heap base first, so that
  // (r - base) >> shift yields 0 for null; non-null oops are unaffected
  // by the cmov.
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}
5543
// Compress the oop in 'r' in place; the caller guarantees 'r' is non-null,
// so the null-mapping cmov of encode_heap_oop can be omitted.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    // Debug check that the caller's non-null guarantee actually holds.
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
  if (CompressedOops::base() != nullptr) {
    subq(r, r12_heapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}
5564
// Two-register variant: compress the non-null oop in 'src' into 'dst',
// leaving 'src' unchanged (unless dst == src).
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    // Debug check that the caller's non-null guarantee actually holds.
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
  if (dst != src) {
    movq(dst, src);
  }
  if (CompressedOops::base() != nullptr) {
    subq(dst, r12_heapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}
5588
// Decompress the (possibly null) narrow oop in 'r' in place.
void MacroAssembler::decode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (CompressedOops::base() == nullptr) {
    // Zero-based mode: decoding is just an (optional) shift; null stays null.
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      shlq(r, LogMinObjAlignmentInBytes);
    }
  } else {
    // Heap-based mode. shlq sets ZF from its result (the shift count is
    // non-zero whenever a base is set; see the sanity assert in
    // decode_heap_oop_not_null), so a null narrow oop stays zero and the
    // base addition is skipped.
    Label done;
    shlq(r, LogMinObjAlignmentInBytes);
    jccb(Assembler::equal, done);
    addq(r, r12_heapbase);
    bind(done);
  }
  verify_oop_msg(r, "broken oop in decode_heap_oop");
}
5607
// Decompress the narrow oop in 'r' in place; the caller guarantees it is
// non-null, so the base can be added unconditionally.
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shlq(r, LogMinObjAlignmentInBytes);
    if (CompressedOops::base() != nullptr) {
      addq(r, r12_heapbase);
    }
  } else {
    // shift == 0 implies unscaled, zero-based mode: nothing to do.
    assert (CompressedOops::base() == nullptr, "sanity");
  }
}
5625
// Two-register variant: decompress the non-null narrow oop in 'src' into
// 'dst', leaving 'src' unchanged (unless dst == src).
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      // shift == 3: fold base + (src << 3) into a single lea.
      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    } else {
      if (dst != src) {
        movq(dst, src);
      }
      shlq(dst, LogMinObjAlignmentInBytes);
      if (CompressedOops::base() != nullptr) {
        addq(dst, r12_heapbase);
      }
    }
  } else {
    // shift == 0 implies unscaled, zero-based mode: plain register move.
    assert (CompressedOops::base() == nullptr, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
  }
}
5653
// Compress the non-null Klass* in 'r' in place. 'tmp' is clobbered when a
// base subtraction is needed.
void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
  BLOCK_COMMENT("encode_klass_not_null {");
  assert_different_registers(r, tmp);
  if (CompressedKlassPointers::base() != nullptr) {
    if (AOTCodeCache::is_on_for_dump()) {
      // When dumping for the AOT code cache, load the base from memory
      // instead of embedding it as an immediate, so the cached code is not
      // tied to this run's base value.
      movptr(tmp, ExternalAddress(CompressedKlassPointers::base_addr()));
    } else {
      movptr(tmp, (intptr_t)CompressedKlassPointers::base());
    }
    subq(r, tmp);
  }
  if (CompressedKlassPointers::shift() != 0) {
    shrq(r, CompressedKlassPointers::shift());
  }
  BLOCK_COMMENT("} encode_klass_not_null");
}
5670
// Compress the non-null Klass* in 'src' into 'dst' without a scratch
// register: the negated base is materialized in 'dst' and 'src' is added,
// which leaves 'src' untouched.
void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
  BLOCK_COMMENT("encode_and_move_klass_not_null {");
  assert_different_registers(src, dst);
  if (CompressedKlassPointers::base() != nullptr) {
    movptr(dst, -(intptr_t)CompressedKlassPointers::base());
    addq(dst, src);
  } else {
    movptr(dst, src);
  }
  if (CompressedKlassPointers::shift() != 0) {
    shrq(dst, CompressedKlassPointers::shift());
  }
  BLOCK_COMMENT("} encode_and_move_klass_not_null");
}
5685
// Decompress the non-null narrow klass in 'r' in place. 'tmp' is clobbered
// when a base addition is needed.
void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
  BLOCK_COMMENT("decode_klass_not_null {");
  assert_different_registers(r, tmp);
  // Note: it will change flags
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedKlassPointers::shift() != 0) {
    shlq(r, CompressedKlassPointers::shift());
  }
  if (CompressedKlassPointers::base() != nullptr) {
    if (AOTCodeCache::is_on_for_dump()) {
      // When dumping for the AOT code cache, load the base from memory
      // instead of embedding it as an immediate, so the cached code is not
      // tied to this run's base value.
      movptr(tmp, ExternalAddress(CompressedKlassPointers::base_addr()));
    } else {
      movptr(tmp, (intptr_t)CompressedKlassPointers::base());
    }
    addq(r, tmp);
  }
  BLOCK_COMMENT("} decode_klass_not_null");
}
5706
// Decompress the non-null narrow klass in 'src' into 'dst' without a
// scratch register, leaving 'src' untouched. Chooses among three codegen
// strategies depending on base/shift configuration.
void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
  BLOCK_COMMENT("decode_and_move_klass_not_null {");
  assert_different_registers(src, dst);
  // Note: it will change flags
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.

  if (CompressedKlassPointers::base() == nullptr &&
      CompressedKlassPointers::shift() == 0) {
    // The best case scenario is that there is no base or shift. Then it is already
    // a pointer that needs nothing but a register rename.
    movl(dst, src);
  } else {
    if (CompressedKlassPointers::shift() <= Address::times_8) {
      // Small shift: materialize the base (or zero) in dst, then either
      // fold base + (src << 3) into one lea (shift == 3) or add src
      // directly (shift == 0).
      if (CompressedKlassPointers::base() != nullptr) {
        movptr(dst, (intptr_t)CompressedKlassPointers::base());
      } else {
        xorq(dst, dst);
      }
      if (CompressedKlassPointers::shift() != 0) {
        assert(CompressedKlassPointers::shift() == Address::times_8, "klass not aligned on 64bits?");
        leaq(dst, Address(dst, src, Address::times_8, 0));
      } else {
        addq(dst, src);
      }
    } else {
      // Shift too large for lea scaling: add src to the pre-right-shifted
      // base and shift the sum left, i.e. (base >> s) + src, then << s.
      if (CompressedKlassPointers::base() != nullptr) {
        const intptr_t base_right_shifted =
            (intptr_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
        movptr(dst, base_right_shifted);
      } else {
        xorq(dst, dst);
      }
      addq(dst, src);
      shlq(dst, CompressedKlassPointers::shift());
    }
  }
  BLOCK_COMMENT("} decode_and_move_klass_not_null");
}
5747
5748 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5749 assert (UseCompressedOops, "should only be used for compressed headers");
5750 assert (Universe::heap() != nullptr, "java heap should be initialized");
5751 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5752 int oop_index = oop_recorder()->find_index(obj);
5753 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5754 mov_narrow_oop(dst, oop_index, rspec);
5755 }
5756
5757 void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5758 assert (UseCompressedOops, "should only be used for compressed headers");
5759 assert (Universe::heap() != nullptr, "java heap should be initialized");
5760 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5761 int oop_index = oop_recorder()->find_index(obj);
5762 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5763 mov_narrow_oop(dst, oop_index, rspec);
5764 }
5765
5766 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5767 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5768 int klass_index = oop_recorder()->find_index(k);
5769 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5770 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5771 }
5772
5773 void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5774 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5775 int klass_index = oop_recorder()->find_index(k);
5776 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5777 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5778 }
5779
5780 void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5781 assert (UseCompressedOops, "should only be used for compressed headers");
5782 assert (Universe::heap() != nullptr, "java heap should be initialized");
5783 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5784 int oop_index = oop_recorder()->find_index(obj);
5785 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5786 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5787 }
5788
5789 void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5790 assert (UseCompressedOops, "should only be used for compressed headers");
5791 assert (Universe::heap() != nullptr, "java heap should be initialized");
5792 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5793 int oop_index = oop_recorder()->find_index(obj);
5794 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5795 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5796 }
5797
5798 void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5799 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5800 int klass_index = oop_recorder()->find_index(k);
5801 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5802 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5803 }
5804
5805 void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5806 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5807 int klass_index = oop_recorder()->find_index(k);
5808 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5809 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5810 }
5811
// (Re)load r12_heapbase with the compressed-oops base. Before the Java heap
// is initialized the base is not known statically, so it is loaded from the
// CompressedOops base cell in memory instead of as an immediate.
void MacroAssembler::reinit_heapbase() {
  if (UseCompressedOops) {
    if (Universe::heap() != nullptr) {
      if (CompressedOops::base() == nullptr) {
        MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
      } else {
        mov64(r12_heapbase, (int64_t)CompressedOops::base());
      }
    } else {
      movptr(r12_heapbase, ExternalAddress(CompressedOops::base_addr()));
    }
  }
}
5825
5826 #if COMPILER2_OR_JVMCI
5827
// clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers.
// Clobbers 'base', 'cnt', 'rtmp' and 'xtmp'; 'mask' is used only on the
// AVX-512 masked-tail paths.
void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
  // cnt - number of qwords (8-byte words).
  // base - start address, qword aligned.
  Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
  bool use64byteVector = (MaxVectorSize == 64) && (CopyAVX3Threshold == 0);
  // Zero the vector register that serves as the store source.
  if (use64byteVector) {
    vpxor(xtmp, xtmp, xtmp, AVX_512bit);
  } else if (MaxVectorSize >= 32) {
    vpxor(xtmp, xtmp, xtmp, AVX_256bit);
  } else {
    pxor(xtmp, xtmp);
  }
  jmp(L_zero_64_bytes);

  // Main loop: clear 64 bytes (8 qwords) per iteration.
  BIND(L_loop);
  if (MaxVectorSize >= 32) {
    fill64(base, 0, xtmp, use64byteVector);
  } else {
    movdqu(Address(base,  0), xtmp);
    movdqu(Address(base, 16), xtmp);
    movdqu(Address(base, 32), xtmp);
    movdqu(Address(base, 48), xtmp);
  }
  addptr(base, 64);

  BIND(L_zero_64_bytes);
  subptr(cnt, 8);
  jccb(Assembler::greaterEqual, L_loop);

  // Copy trailing 64 bytes
  if (use64byteVector) {
    // AVX-512 path: one masked store covers any remaining 1..7 qwords.
    addptr(cnt, 8);
    jccb(Assembler::equal, L_end);
    fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
    jmp(L_end);
  } else {
    // Handle a remaining 4-qword (32-byte) chunk, then fall into the tail.
    addptr(cnt, 4);
    jccb(Assembler::less, L_tail);
    if (MaxVectorSize >= 32) {
      vmovdqu(Address(base, 0), xtmp);
    } else {
      movdqu(Address(base,  0), xtmp);
      movdqu(Address(base, 16), xtmp);
    }
  }
  addptr(base, 32);
  subptr(cnt, 4);

  // Tail: 0..3 remaining qwords.
  BIND(L_tail);
  addptr(cnt, 4);
  jccb(Assembler::lessEqual, L_end);
  if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
    fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
  } else {
    // Scalar qword-at-a-time tail loop.
    decrement(cnt);

    BIND(L_sloop);
    movq(Address(base, 0), xtmp);
    addptr(base, 8);
    decrement(cnt);
    jccb(Assembler::greaterEqual, L_sloop);
  }
  BIND(L_end);
}
5893
// Clearing constant sized memory using YMM/ZMM registers.
// 'cnt' is a compile-time constant number of qwords; the 64-byte chunks are
// emitted unrolled (with a runtime loop only above max_unrolled_fill64),
// and the 0..7 qword remainder is handled with masked/partial stores.
void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
  assert(UseAVX > 2 && VM_Version::supports_avx512vl(), "");
  bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);

  // Split cnt qwords into full 64-byte (8-qword) chunks plus a 0..7 tail.
  int vector64_count = (cnt & (~0x7)) >> 3;
  cnt = cnt & 0x7;
  const int fill64_per_loop = 4;
  const int max_unrolled_fill64 = 8;

  // 64 byte initialization loop.
  vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
  int start64 = 0;
  if (vector64_count > max_unrolled_fill64) {
    Label LOOP;
    Register index = rtmp;

    // Chunks [0, start64) are cleared by the runtime loop below,
    // fill64_per_loop chunks per iteration; the remainder [start64,
    // vector64_count) is emitted unrolled afterwards.
    start64 = vector64_count - (vector64_count % fill64_per_loop);

    movl(index, 0);
    BIND(LOOP);
    for (int i = 0; i < fill64_per_loop; i++) {
      fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
    }
    addl(index, fill64_per_loop * 64);
    cmpl(index, start64 * 64);
    jccb(Assembler::less, LOOP);
  }
  for (int i = start64; i < vector64_count; i++) {
    fill64(base, i * 64, xtmp, use64byteVector);
  }

  // Clear remaining 64 byte tail.
  int disp = vector64_count * 64;
  if (cnt) {
    // One case per possible remainder (1..7 qwords), using the narrowest
    // store or a masked store that covers it exactly.
    switch (cnt) {
      case 1:
        movq(Address(base, disp), xtmp);
        break;
      case 2:
        evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit);
        break;
      case 3:
        movl(rtmp, 0x7);
        kmovwl(mask, rtmp);
        evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit);
        break;
      case 4:
        evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
        break;
      case 5:
        if (use64byteVector) {
          movl(rtmp, 0x1F);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
        } else {
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
          movq(Address(base, disp + 32), xtmp);
        }
        break;
      case 6:
        if (use64byteVector) {
          movl(rtmp, 0x3F);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
        } else {
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
          evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit);
        }
        break;
      case 7:
        if (use64byteVector) {
          movl(rtmp, 0x7F);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
        } else {
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
          movl(rtmp, 0x7);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit);
        }
        break;
      default:
        fatal("Unexpected length : %d\n",cnt);
        break;
    }
  }
}
5982
// Clear 'cnt' qwords at 'base'. Small counts use a simple store loop;
// larger ones use rep-stos or the XMM path. Register assignments are fixed
// by the rep stos instruction (rdi/rax/rcx).
void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
                               bool is_large, KRegister mask) {
  // cnt - number of qwords (8-byte words).
  // base - start address, qword aligned.
  // is_large - if optimizers know cnt is larger than InitArrayShortSize
  assert(base==rdi, "base register must be edi for rep stos");
  assert(tmp==rax, "tmp register must be eax for rep stos");
  assert(cnt==rcx, "cnt register must be ecx for rep stos");
  assert(InitArrayShortSize % BytesPerLong == 0,
    "InitArrayShortSize should be the multiple of BytesPerLong");

  Label DONE;
  // rep stos needs a zero in rax; the XMM path zeroes xtmp itself.
  if (!is_large || !UseXMMForObjInit) {
    xorptr(tmp, tmp);
  }

  if (!is_large) {
    Label LOOP, LONG;
    cmpptr(cnt, InitArrayShortSize/BytesPerLong);
    jccb(Assembler::greater, LONG);

    decrement(cnt);
    jccb(Assembler::negative, DONE); // Zero length

    // Use individual pointer-sized stores for small counts:
    BIND(LOOP);
    movptr(Address(base, cnt, Address::times_ptr), tmp);
    decrement(cnt);
    jccb(Assembler::greaterEqual, LOOP);
    jmpb(DONE);

    BIND(LONG);
  }

  // Use longer rep-prefixed ops for non-small counts:
  if (UseFastStosb) {
    shlptr(cnt, 3); // convert to number of bytes
    rep_stosb();
  } else if (UseXMMForObjInit) {
    xmm_clear_mem(base, cnt, tmp, xtmp, mask);
  } else {
    rep_stos();
  }

  BIND(DONE);
}
6029
6030 #endif //COMPILER2_OR_JVMCI
6031
6032
// Fill 'count' elements of type 't' (byte/short/int) starting at 'to' with
// 'value'. 'count' is in elements; sub-int values are first replicated to a
// full 32-bit pattern. When 'aligned' is true the byte/short alignment
// prologue is skipped. 'rtmp' and 'xtmp' are scratch.
// Throughout, 'shift' = log2(elements per 32-bit word): byte=2, short=1, int=0.
void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                   Register to, Register value, Register count,
                                   Register rtmp, XMMRegister xtmp) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(to, value, count, rtmp);
  Label L_exit;
  Label L_fill_2_bytes, L_fill_4_bytes;

#if defined(COMPILER2)
  // Prefer the AVX-512 masked-fill stub when the CPU supports it.
  if(MaxVectorSize >=32 &&
     VM_Version::supports_avx512vlbw() &&
     VM_Version::supports_bmi2()) {
    generate_fill_avx3(t, to, value, count, rtmp, xtmp);
    return;
  }
#endif

  int shift = -1;
  switch (t) {
    case T_BYTE:
      shift = 2;
      break;
    case T_SHORT:
      shift = 1;
      break;
    case T_INT:
      shift = 0;
      break;
    default: ShouldNotReachHere();
  }

  // Replicate a byte value into all 4 bytes / a short into both halves of
  // the 32-bit 'value' register.
  if (t == T_BYTE) {
    andl(value, 0xff);
    movl(rtmp, value);
    shll(rtmp, 8);
    orl(value, rtmp);
  }
  if (t == T_SHORT) {
    andl(value, 0xffff);
  }
  if (t == T_BYTE || t == T_SHORT) {
    movl(rtmp, value);
    shll(rtmp, 16);
    orl(value, rtmp);
  }

  cmpptr(count, 8 << shift); // Short arrays (< 32 bytes) fill by element
  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
    Label L_skip_align2;
    // align source address at 4 bytes address boundary
    if (t == T_BYTE) {
      Label L_skip_align1;
      // One byte misalignment happens only for byte arrays
      testptr(to, 1);
      jccb(Assembler::zero, L_skip_align1);
      movb(Address(to, 0), value);
      increment(to);
      decrement(count);
      BIND(L_skip_align1);
    }
    // Two bytes misalignment happens only for byte and short (char) arrays
    testptr(to, 2);
    jccb(Assembler::zero, L_skip_align2);
    movw(Address(to, 0), value);
    addptr(to, 2);
    subptr(count, 1<<(shift-1));
    BIND(L_skip_align2);
  }
  {
    Label L_fill_32_bytes;
    if (!UseUnalignedLoadStores) {
      // align to 8 bytes, we know we are 4 byte aligned to start
      testptr(to, 4);
      jccb(Assembler::zero, L_fill_32_bytes);
      movl(Address(to, 0), value);
      addptr(to, 4);
      subptr(count, 1<<shift);
    }
    BIND(L_fill_32_bytes);
    {
      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
      // Broadcast the 32-bit pattern into the vector register.
      movdl(xtmp, value);
      if (UseAVX >= 2 && UseUnalignedLoadStores) {
        Label L_check_fill_32_bytes;
        if (UseAVX > 2) {
          // Fill 64-byte chunks
          Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;

          // If number of bytes to fill < CopyAVX3Threshold, perform fill using AVX2
          cmpptr(count, CopyAVX3Threshold);
          jccb(Assembler::below, L_check_fill_64_bytes_avx2);

          vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);

          subptr(count, 16 << shift);
          jcc(Assembler::less, L_check_fill_32_bytes);
          align(16);

          BIND(L_fill_64_bytes_loop_avx3);
          evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
          addptr(to, 64);
          subptr(count, 16 << shift);
          jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
          jmpb(L_check_fill_32_bytes);

          BIND(L_check_fill_64_bytes_avx2);
        }
        // Fill 64-byte chunks
        vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);

        subptr(count, 16 << shift);
        jcc(Assembler::less, L_check_fill_32_bytes);

        // align data for 64-byte chunks
        Label L_fill_64_bytes_loop, L_align_64_bytes_loop;
        if (EnableX86ECoreOpts) {
          // align 'big' arrays to cache lines to minimize split_stores
          cmpptr(count, 96 << shift);
          jcc(Assembler::below, L_fill_64_bytes_loop);

          // Find the bytes needed for alignment
          movptr(rtmp, to);
          andptr(rtmp, 0x1c);
          jcc(Assembler::zero, L_fill_64_bytes_loop);
          negptr(rtmp); // number of bytes to fill 32-rtmp. it filled by 2 mov by 32
          addptr(rtmp, 32);
          shrptr(rtmp, 2 - shift);// get number of elements from bytes
          subptr(count, rtmp); // adjust count by number of elements

          // Store 4 bytes at a time until 'to' is 32-byte aligned.
          align(16);
          BIND(L_align_64_bytes_loop);
          movdl(Address(to, 0), xtmp);
          addptr(to, 4);
          subptr(rtmp, 1 << shift);
          jcc(Assembler::greater, L_align_64_bytes_loop);
        }

        align(16);
        BIND(L_fill_64_bytes_loop);
        vmovdqu(Address(to, 0), xtmp);
        vmovdqu(Address(to, 32), xtmp);
        addptr(to, 64);
        subptr(count, 16 << shift);
        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);

        align(16);
        BIND(L_check_fill_32_bytes);
        addptr(count, 8 << shift);
        jccb(Assembler::less, L_check_fill_8_bytes);
        vmovdqu(Address(to, 0), xtmp);
        addptr(to, 32);
        subptr(count, 8 << shift);

        BIND(L_check_fill_8_bytes);
        // clean upper bits of YMM registers
        movdl(xtmp, value);
        pshufd(xtmp, xtmp, 0);
      } else {
        // Fill 32-byte chunks
        pshufd(xtmp, xtmp, 0);

        subptr(count, 8 << shift);
        jcc(Assembler::less, L_check_fill_8_bytes);
        align(16);

        BIND(L_fill_32_bytes_loop);

        if (UseUnalignedLoadStores) {
          movdqu(Address(to, 0), xtmp);
          movdqu(Address(to, 16), xtmp);
        } else {
          movq(Address(to, 0), xtmp);
          movq(Address(to, 8), xtmp);
          movq(Address(to, 16), xtmp);
          movq(Address(to, 24), xtmp);
        }

        addptr(to, 32);
        subptr(count, 8 << shift);
        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);

        BIND(L_check_fill_8_bytes);
      }
      addptr(count, 8 << shift);
      jccb(Assembler::zero, L_exit);
      jmpb(L_fill_8_bytes);

      //
      // length is too short, just fill qwords
      //
      align(16);
      BIND(L_fill_8_bytes_loop);
      movq(Address(to, 0), xtmp);
      addptr(to, 8);
      BIND(L_fill_8_bytes);
      subptr(count, 1 << (shift + 1));
      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    }
  }
  // fill trailing 4 bytes
  Label L_fill_4_bytes_loop;
  testl(count, 1 << shift);
  jccb(Assembler::zero, L_fill_2_bytes);

  align(16);
  BIND(L_fill_4_bytes_loop);
  movl(Address(to, 0), value);
  addptr(to, 4);

  BIND(L_fill_4_bytes);
  subptr(count, 1 << shift);
  jccb(Assembler::greaterEqual, L_fill_4_bytes_loop);

  if (t == T_BYTE || t == T_SHORT) {
    Label L_fill_byte;
    BIND(L_fill_2_bytes);
    // fill trailing 2 bytes
    testl(count, 1<<(shift-1));
    jccb(Assembler::zero, L_fill_byte);
    movw(Address(to, 0), value);
    if (t == T_BYTE) {
      addptr(to, 2);
      BIND(L_fill_byte);
      // fill trailing byte
      testl(count, 1);
      jccb(Assembler::zero, L_exit);
      movb(Address(to, 0), value);
    } else {
      BIND(L_fill_byte);
    }
  } else {
    BIND(L_fill_2_bytes);
  }
  BIND(L_exit);
}
6269
6270 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
6271 switch(type) {
6272 case T_BYTE:
6273 case T_BOOLEAN:
6274 evpbroadcastb(dst, src, vector_len);
6275 break;
6276 case T_SHORT:
6277 case T_CHAR:
6278 evpbroadcastw(dst, src, vector_len);
6279 break;
6280 case T_INT:
6281 case T_FLOAT:
6282 evpbroadcastd(dst, src, vector_len);
6283 break;
6284 case T_LONG:
6285 case T_DOUBLE:
6286 evpbroadcastq(dst, src, vector_len);
6287 break;
6288 default:
6289 fatal("Unhandled type : %s", type2name(type));
6290 break;
6291 }
6292 }
6293
6294 // Encode given char[]/byte[] to byte[] in ISO_8859_1 or ASCII
6295 //
6296 // @IntrinsicCandidate
6297 // int sun.nio.cs.ISO_8859_1.Encoder#encodeISOArray0(
6298 // char[] sa, int sp, byte[] da, int dp, int len) {
6299 // int i = 0;
6300 // for (; i < len; i++) {
6301 // char c = sa[sp++];
6302 // if (c > '\u00FF')
6303 // break;
6304 // da[dp++] = (byte) c;
6305 // }
6306 // return i;
6307 // }
6308 //
6309 // @IntrinsicCandidate
6310 // int java.lang.StringCoding.encodeISOArray0(
6311 // byte[] sa, int sp, byte[] da, int dp, int len) {
6312 // int i = 0;
6313 // for (; i < len; i++) {
6314 // char c = StringUTF16.getChar(sa, sp++);
6315 // if (c > '\u00FF')
6316 // break;
6317 // da[dp++] = (byte) c;
6318 // }
6319 // return i;
6320 // }
6321 //
6322 // @IntrinsicCandidate
6323 // int java.lang.StringCoding.encodeAsciiArray0(
6324 // char[] sa, int sp, byte[] da, int dp, int len) {
6325 // int i = 0;
6326 // for (; i < len; i++) {
6327 // char c = sa[sp++];
6328 // if (c >= '\u0080')
6329 // break;
6330 // da[dp++] = (byte) c;
6331 // }
6332 // return i;
6333 // }
// Encode char[]/UTF-16 byte[] data at `src` into single-byte data at `dst`.
// Returns (in `result`) the number of chars successfully encoded before the
// first char that does not fit the target charset (ISO-8859-1 when
// ascii == false, US-ASCII when ascii == true), or `len` if all fit.
void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
                                      XMMRegister tmp1Reg, XMMRegister tmp2Reg,
                                      XMMRegister tmp3Reg, XMMRegister tmp4Reg,
                                      Register tmp5, Register result, bool ascii) {

  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(src, dst, len, tmp5, result);
  Label L_done, L_copy_1_char, L_copy_1_char_exit;

  // A char is encodable iff (c & short_mask) == 0:
  // ISO-8859-1 admits c <= 0xFF (test 0xff00); ASCII admits c < 0x80 (test 0xff80).
  int mask = ascii ? 0xff80ff80 : 0xff00ff00;
  int short_mask = ascii ? 0xff80 : 0xff00;

  // set result
  xorl(result, result);
  // check for zero length
  testl(len, len);
  jcc(Assembler::zero, L_done);

  movl(result, len);

  // Setup pointers: point just past the end of both arrays and let `len`
  // run as a negative index counting up towards zero.
  lea(src, Address(src, len, Address::times_2)); // char[]
  lea(dst, Address(dst, len, Address::times_1)); // byte[]
  negptr(len);

  if (UseSSE42Intrinsics || UseAVX >= 2) {
    Label L_copy_8_chars, L_copy_8_chars_exit;
    Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;

    if (UseAVX >= 2) {
      Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
      movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
      movdl(tmp1Reg, tmp5);
      vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
      jmp(L_chars_32_check);

      // Main 32-chars-per-iteration loop (two 256-bit loads).
      bind(L_copy_32_chars);
      vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
      vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
      vptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
      jccb(Assembler::notZero, L_copy_32_chars_exit);
      // Narrow 32 chars to 32 bytes; vpermq(0xD8) repairs the 128-bit-lane
      // interleaving that vpackuswb produces.
      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
      vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);

      bind(L_chars_32_check);
      addptr(len, 32);
      jcc(Assembler::lessEqual, L_copy_32_chars);

      bind(L_copy_32_chars_exit);
      subptr(len, 16);
      jccb(Assembler::greater, L_copy_16_chars_exit);

    } else if (UseSSE42Intrinsics) {
      movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
      jmpb(L_chars_16_check);
    }

    // 16 chars per iteration.
    bind(L_copy_16_chars);
    if (UseAVX >= 2) {
      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
      vptest(tmp2Reg, tmp1Reg);
      jcc(Assembler::notZero, L_copy_16_chars_exit);
      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
    } else {
      if (UseAVX > 0) {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
      } else {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        por(tmp2Reg, tmp3Reg);
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        por(tmp2Reg, tmp4Reg);
      }
      ptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
      jccb(Assembler::notZero, L_copy_16_chars_exit);
      packuswb(tmp3Reg, tmp4Reg);
    }
    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);

    bind(L_chars_16_check);
    addptr(len, 16);
    jcc(Assembler::lessEqual, L_copy_16_chars);

    bind(L_copy_16_chars_exit);
    if (UseAVX >= 2) {
      // clean upper bits of YMM registers
      vpxor(tmp2Reg, tmp2Reg);
      vpxor(tmp3Reg, tmp3Reg);
      vpxor(tmp4Reg, tmp4Reg);
      // Rebuild the 128-bit test mask for the SSE tail below.
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
    }
    subptr(len, 8);
    jccb(Assembler::greater, L_copy_8_chars_exit);

    // 8 chars per iteration.
    bind(L_copy_8_chars);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
    ptest(tmp3Reg, tmp1Reg);
    jccb(Assembler::notZero, L_copy_8_chars_exit);
    packuswb(tmp3Reg, tmp1Reg);
    movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
    addptr(len, 8);
    jccb(Assembler::lessEqual, L_copy_8_chars);

    bind(L_copy_8_chars_exit);
    subptr(len, 8);   // undo the +8 bias introduced for the 8-char loop
    jccb(Assembler::zero, L_done);
  }

  // Scalar tail: encode one char at a time until a non-encodable char
  // is found or the end of the input is reached.
  bind(L_copy_1_char);
  load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
  testl(tmp5, short_mask);      // check if Unicode or non-ASCII char
  jccb(Assembler::notZero, L_copy_1_char_exit);
  movb(Address(dst, len, Address::times_1, 0), tmp5);
  addptr(len, 1);
  jccb(Assembler::less, L_copy_1_char);

  bind(L_copy_1_char_exit);
  addptr(result, len); // len is negative count of not processed elements

  bind(L_done);
}
6467
/**
 * Helper for multiply_to_len().
 * Adds src1 and src2 into the 128-bit accumulator dest_hi:dest_lo,
 * propagating the carry of each 64-bit addition into dest_hi.
 */
void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
  addq(dest_lo, src1);
  adcq(dest_hi, 0);   // fold carry-out of the first addition into the high word
  addq(dest_lo, src2);
  adcq(dest_hi, 0);   // fold carry-out of the second addition into the high word
}
6477
/**
 * Multiply 64 bit by 64 bit first loop.
 * Multiplies the top 64-bit word x[xstart] against all of y[], storing the
 * 64-bit partial results into z[] big-endian, 32 bits at a time.
 * BigInteger digit arrays are big-endian arrays of 32-bit ints, so 64-bit
 * loads/stores are rotated by 32 to convert endianness.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  decrementl(xstart);
  jcc(Assembler::negative, L_one_x);   // special case: x has a single 32-bit digit

  // Load two 32-bit digits of x as one little-endian 64-bit value.
  movq(x_xstart, Address(x, xstart, Address::times_4, 0));
  rorq(x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  decrementl(idx);
  jcc(Assembler::negative, L_first_loop_exit);
  decrementl(idx);
  jcc(Assembler::negative, L_one_y);   // special case: a single digit of y remains
  movq(y_idx, Address(y, idx, Address::times_4, 0));
  rorq(y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);
  movq(product, x_xstart);
  mulq(y_idx); // product(rax) * y_idx -> rdx:rax
  addq(product, carry);
  adcq(rdx, 0);   // carry-out of the addition goes into the high half
  subl(kdx, 2);
  // Store the 64-bit result as two big-endian 32-bit digits.
  movl(Address(z, kdx, Address::times_4, 4), product);
  shrq(product, 32);
  movl(Address(z, kdx, Address::times_4, 0), product);
  movq(carry, rdx);
  jmp(L_first_loop);

  bind(L_one_y);
  movl(y_idx, Address(y, 0));   // movl zero-extends the lone digit
  jmp(L_multiply);

  bind(L_one_x);
  movl(x_xstart, Address(x, 0));   // movl zero-extends the lone digit
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}
6533
/**
 * Multiply 64 bit by 64 bit and add 128 bit.
 * Computes rdx:product = y[idx..idx+1] * x_xstart + z[idx..idx+1] + carry and
 * stores the low 64 bits back into z; the caller reads the new carry from rdx.
 */
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
                                            Register yz_idx, Register idx,
                                            Register carry, Register product, int offset) {
  //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //     z[kdx] = (jlong)product;

  movq(yz_idx, Address(y, idx, Address::times_4, offset));
  rorq(yz_idx, 32); // convert big-endian to little-endian
  movq(product, x_xstart);
  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
  movq(yz_idx, Address(z, idx, Address::times_4, offset));
  rorq(yz_idx, 32); // convert big-endian to little-endian

  add2_with_carry(rdx, product, carry, yz_idx);   // rdx:product += carry + z digits

  // Store the low 64 bits as two big-endian 32-bit digits; the high 64 bits
  // stay in rdx for the caller to pick up as the next carry.
  movl(Address(z, idx, Address::times_4, offset+4), product);
  shrq(product, 32);
  movl(Address(z, idx, Address::times_4, offset), product);

}
6557
/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
 * Processes four 32-bit digits of y per iteration (two 64x64 multiply-adds),
 * then handles up to three leftover digits.
 */
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
                                             Register yz_idx, Register idx, Register jdx,
                                             Register carry, Register product,
                                             Register carry2) {
  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //     z[kdx+idx+1] = (jlong)product;
  //     jlong carry2 = (jlong)(product >>> 64);
  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //     z[kdx+idx] = (jlong)product;
  //     carry = (jlong)(product >>> 64);
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)product;
  //     carry = (jlong)(product >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = number of 4-digit (two 64-bit word) chunks to process
  movl(jdx, idx);
  andl(jdx, 0xFFFFFFFC);
  shrl(jdx, 2);

  bind(L_third_loop);
  subl(jdx, 1);
  jcc(Assembler::negative, L_third_loop_exit);
  subl(idx, 4);

  // Two 64x64 multiply-adds; each leaves its high half (the new carry) in rdx.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
  movq(carry2, rdx);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
  movq(carry, rdx);
  jmp(L_third_loop);

  bind (L_third_loop_exit);

  // Handle up to three remaining 32-bit digits.
  andl (idx, 0x3);
  jcc(Assembler::zero, L_post_third_loop_done);

  Label L_check_1;
  subl(idx, 2);
  jcc(Assembler::negative, L_check_1);

  // One remaining 64-bit (two-digit) multiply-add.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
  movq(carry, rdx);

  bind (L_check_1);
  addl (idx, 0x2);
  andl (idx, 0x1);
  subl(idx, 1);
  jcc(Assembler::negative, L_post_third_loop_done);

  // One remaining 32-bit digit: multiply-add with zero-extended operands.
  movl(yz_idx, Address(y, idx, Address::times_4, 0));
  movq(product, x_xstart);
  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
  movl(yz_idx, Address(z, idx, Address::times_4, 0));

  add2_with_carry(rdx, product, yz_idx, carry);

  // Store the low digit; combine the remaining 96 bits of rdx:product
  // into a single 64-bit carry (only 64 significant bits remain here).
  movl(Address(z, idx, Address::times_4, 0), product);
  shrq(product, 32);

  shlq(rdx, 32);
  orq(product, rdx);
  movq(carry, product);

  bind(L_post_third_loop_done);
}
6635
/**
 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
 *
 * The 64-bit multiplicand (x[xstart]) is expected in rdx, the implicit
 * source operand of mulxq (see the caller's pseudocode below, which uses
 * `rdx` directly).
 */
void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
                                                  Register carry, Register carry2,
                                                  Register idx, Register jdx,
                                                  Register yz_idx1, Register yz_idx2,
                                                  Register tmp, Register tmp3, Register tmp4) {
  assert(UseBMI2Instructions, "should be used only when BMI2 is available");

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
  //     jlong carry2 = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
  //     carry = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = number of 4-digit chunks
  movl(jdx, idx);
  andl(jdx, 0xFFFFFFFC);
  shrl(jdx, 2);

  bind(L_third_loop);
  subl(jdx, 1);
  jcc(Assembler::negative, L_third_loop_exit);
  subl(idx, 4);

  // Load two 64-bit words of y (big-endian digit pairs -> little-endian).
  movq(yz_idx1, Address(y, idx, Address::times_4, 8));
  rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  movq(yz_idx2, Address(y, idx, Address::times_4, 0));
  rorxq(yz_idx2, yz_idx2, 32);

  // mulxq does not touch flags, so both products can be formed up front.
  mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
  mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp

  // Load the matching two 64-bit words of z.
  movq(yz_idx1, Address(z, idx, Address::times_4, 8));
  rorxq(yz_idx1, yz_idx1, 32);
  movq(yz_idx2, Address(z, idx, Address::times_4, 0));
  rorxq(yz_idx2, yz_idx2, 32);

  if (VM_Version::supports_adx()) {
    // ADX: run two independent carry chains in parallel — adcx uses/updates
    // only CF, adox only OF — then collect both into carry2.
    adcxq(tmp3, carry);
    adoxq(tmp3, yz_idx1);

    adcxq(tmp4, tmp);
    adoxq(tmp4, yz_idx2);

    movl(carry, 0); // does not affect flags
    adcxq(carry2, carry);   // fold CF chain into carry2
    adoxq(carry2, carry);   // fold OF chain into carry2
  } else {
    add2_with_carry(tmp4, tmp3, carry, yz_idx1);
    add2_with_carry(carry2, tmp4, tmp, yz_idx2);
  }
  movq(carry, carry2);

  // Store both 64-bit results back as big-endian 32-bit digits.
  movl(Address(z, idx, Address::times_4, 12), tmp3);
  shrq(tmp3, 32);
  movl(Address(z, idx, Address::times_4, 8), tmp3);

  movl(Address(z, idx, Address::times_4, 4), tmp4);
  shrq(tmp4, 32);
  movl(Address(z, idx, Address::times_4, 0), tmp4);

  jmp(L_third_loop);

  bind (L_third_loop_exit);

  // Handle up to three remaining 32-bit digits.
  andl (idx, 0x3);
  jcc(Assembler::zero, L_post_third_loop_done);

  Label L_check_1;
  subl(idx, 2);
  jcc(Assembler::negative, L_check_1);

  // One remaining 64-bit (two-digit) multiply-add.
  movq(yz_idx1, Address(y, idx, Address::times_4, 0));
  rorxq(yz_idx1, yz_idx1, 32);
  mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
  movq(yz_idx2, Address(z, idx, Address::times_4, 0));
  rorxq(yz_idx2, yz_idx2, 32);

  add2_with_carry(tmp4, tmp3, carry, yz_idx2);

  movl(Address(z, idx, Address::times_4, 4), tmp3);
  shrq(tmp3, 32);
  movl(Address(z, idx, Address::times_4, 0), tmp3);
  movq(carry, tmp4);

  bind (L_check_1);
  addl (idx, 0x2);
  andl (idx, 0x1);
  subl(idx, 1);
  jcc(Assembler::negative, L_post_third_loop_done);
  // One remaining 32-bit digit.
  movl(tmp4, Address(y, idx, Address::times_4, 0));
  mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
  movl(tmp4, Address(z, idx, Address::times_4, 0));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  // Store the low digit; combine the remaining bits into the next carry.
  movl(Address(z, idx, Address::times_4, 0), tmp3);
  shrq(tmp3, 32);

  shlq(carry2, 32);
  orq(tmp3, carry2);
  movq(carry, tmp3);

  bind(L_post_third_loop_done);
}
6757
/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 * Computes z = x * y where x, y, z are big-endian arrays of 32-bit digits.
 *
 * rdi: x
 * rax: xlen
 * rsi: y
 * rcx: ylen
 * r8:  z
 * r11: tmp0
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register tmp0,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);

  push(tmp0);
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  push(xlen);   // xlen's register doubles as 'product' below; save its value

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;   // aliases xlen (saved above)
  const Register x_xstart = tmp0;

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movl(idx, ylen);      // idx = ylen;
  lea(kdx, Address(xlen, ylen)); // kdx = xlen+ylen;
  xorq(carry, carry); // carry = 0;

  Label L_done;

  movl(xstart, xlen);
  decrementl(xstart);
  jcc(Assembler::negative, L_done);   // empty x: nothing to do

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  testl(kdx, kdx);
  jcc(Assembler::zero, L_second_loop);   // first loop filled all digit slots

  // Store the final carry of the first loop as one or two 32-bit digits.
  Label L_carry;
  subl(kdx, 1);
  jcc(Assembler::zero, L_carry);

  movl(Address(z, kdx, Address::times_4, 0), carry);
  shrq(carry, 32);
  subl(kdx, 1);

  bind(L_carry);
  movl(Address(z, kdx, Address::times_4, 0), carry);

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx

  const Register jdx = tmp1;

  bind(L_second_loop);
  xorl(carry, carry);    // carry = 0;
  movl(jdx, ylen);       // j = ystart+1

  subl(xstart, 1);       // i = xstart-1;
  jcc(Assembler::negative, L_done);

  push (z);

  Label L_last_x;
  lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
  subl(xstart, 1);       // i = xstart-1;
  jcc(Assembler::negative, L_last_x);   // single 32-bit digit of x remains

  // Load x[i..i+1] as one little-endian 64-bit value (into rdx for BMI2's
  // mulxq, otherwise into x_xstart).
  if (UseBMI2Instructions) {
    movq(rdx,  Address(x, xstart, Address::times_4,  0));
    rorxq(rdx, rdx, 32); // convert big-endian to little-endian
  } else {
    movq(x_xstart, Address(x, xstart, Address::times_4, 0));
    rorq(x_xstart, 32); // convert big-endian to little-endian
  }

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  // Save registers the inner loop clobbers.
  push (x);
  push (xstart);
  push (ylen);


  if (UseBMI2Instructions) {
    multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
  } else { // !UseBMI2Instructions
    multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
  }

  pop(ylen);
  pop(xlen);   // pop the saved xstart value into xlen (the current i index)
  pop(x);
  pop(z);

  // Store the carry of this row as z[i..i+1] (high digit only if i-1 >= 0).
  movl(tmp3, xlen);
  addl(tmp3, 1);
  movl(Address(z, tmp3, Address::times_4, 0), carry);
  subl(tmp3, 1);
  jccb(Assembler::negative, L_done);

  shrq(carry, 32);
  movl(Address(z, tmp3, Address::times_4, 0), carry);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  if (UseBMI2Instructions) {
    movl(rdx, Address(x,  0));
  } else {
    movl(x_xstart, Address(x,  0));
  }
  jmp(L_third_loop_prologue);

  bind(L_done);

  pop(xlen);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
  pop(tmp0);
}
6926
// Find the index of the first mismatching element between obja and objb, or
// -1 if the regions are equal. `length` is an element count; the element size
// (log2) arrives in rcx (log2_array_indxscale) and is applied via shlq/shrq,
// which shift by cl. `result` is used as the running byte offset during the
// comparison and converted back to an element index on exit.
void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
                                         Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
  Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
  Label VECTOR8_TAIL, VECTOR4_TAIL;
  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
  Label SAME_TILL_END, DONE;
  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;

  //scale is in rcx in both Win64 and Unix
  ShortBranchVerifier sbv(this);

  shlq(length);          // length in bytes = length << cl (element scale)
  xorq(result, result);  // result = current byte offset, starts at 0

  if ((AVX3Threshold == 0) && (UseAVX > 2) &&
      VM_Version::supports_avx512vlbw()) {
    Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;

    cmpq(length, 64);
    jcc(Assembler::less, VECTOR32_TAIL);

    movq(tmp1, length);
    andq(tmp1, 0x3F);      // tail count
    andq(length, ~(0x3F)); //vector count

    bind(VECTOR64_LOOP);
    // AVX512 code to compare 64 byte vectors.
    evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
    evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
    kortestql(k7, k7);   // CF set iff all 64 mask bits are 1 (all bytes equal)
    jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
    addq(result, 64);
    subq(length, 64);
    jccb(Assembler::notZero, VECTOR64_LOOP);

    // All full 64-byte vectors matched; check whether a tail remains.
    testq(tmp1, tmp1);
    jcc(Assembler::zero, SAME_TILL_END);

    //bind(VECTOR64_TAIL);
    // AVX512 code to compare up to 63 byte vectors.
    // Build a mask register with the low `tail count` bits set.
    mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp2, tmp2, tmp1);
    notq(tmp2);
    kmovql(k3, tmp2);

    evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
    evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);

    ktestql(k7, k3);   // CF set iff every masked byte compared equal
    jcc(Assembler::below, SAME_TILL_END);     // not mismatch

    bind(VECTOR64_NOT_EQUAL);
    // Locate the first zero bit of the compare mask = first mismatching byte.
    kmovql(tmp1, k7);
    notq(tmp1);
    tzcntq(tmp1, tmp1);
    addq(result, tmp1);
    shrq(result);   // convert byte offset to element index (shift by cl)
    jmp(DONE);
    bind(VECTOR32_TAIL);
  }

  cmpq(length, 8);
  jcc(Assembler::equal, VECTOR8_LOOP);
  jcc(Assembler::less, VECTOR4_TAIL);

  if (UseAVX >= 2) {
    Label VECTOR16_TAIL, VECTOR32_LOOP;

    cmpq(length, 16);
    jcc(Assembler::equal, VECTOR16_LOOP);
    jcc(Assembler::less, VECTOR8_LOOP);

    cmpq(length, 32);
    jccb(Assembler::less, VECTOR16_TAIL);

    subq(length, 32);
    bind(VECTOR32_LOOP);
    vmovdqu(rymm0, Address(obja, result));
    vmovdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
    vptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
    addq(result, 32);
    subq(length, 32);
    jcc(Assembler::greaterEqual, VECTOR32_LOOP);
    addq(length, 32);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 32 bytes left //close the branch here.

    bind(VECTOR16_TAIL);
    cmpq(length, 16);
    jccb(Assembler::less, VECTOR8_TAIL);
    bind(VECTOR16_LOOP);
    movdqu(rymm0, Address(obja, result));
    movdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
    ptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
    addq(result, 16);
    subq(length, 16);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 16 bytes left
  } else {//regular intrinsics

    cmpq(length, 16);
    jccb(Assembler::less, VECTOR8_TAIL);

    subq(length, 16);
    bind(VECTOR16_LOOP);
    movdqu(rymm0, Address(obja, result));
    movdqu(rymm1, Address(objb, result));
    pxor(rymm0, rymm1);
    ptest(rymm0, rymm0);
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
    addq(result, 16);
    subq(length, 16);
    jccb(Assembler::greaterEqual, VECTOR16_LOOP);
    addq(length, 16);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 16 bytes left
  }

  bind(VECTOR8_TAIL);
  cmpq(length, 8);
  jccb(Assembler::less, VECTOR4_TAIL);
  bind(VECTOR8_LOOP);
  // Compare 8 bytes via 64-bit GPR xor.
  movq(tmp1, Address(obja, result));
  movq(tmp2, Address(objb, result));
  xorq(tmp1, tmp2);
  testq(tmp1, tmp1);
  jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
  addq(result, 8);
  subq(length, 8);
  jcc(Assembler::equal, SAME_TILL_END);
  //falling through if less than 8 bytes left

  bind(VECTOR4_TAIL);
  cmpq(length, 4);
  jccb(Assembler::less, BYTES_TAIL);
  bind(VECTOR4_LOOP);
  // Compare 4 bytes via 32-bit GPR xor.
  movl(tmp1, Address(obja, result));
  xorl(tmp1, Address(objb, result));
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
  addq(result, 4);
  subq(length, 4);
  jcc(Assembler::equal, SAME_TILL_END);
  //falling through if less than 4 bytes left

  // At most 3 bytes remain; compare them one at a time (unrolled).
  bind(BYTES_TAIL);
  bind(BYTES_LOOP);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  decq(length);
  jcc(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  decq(length);
  jcc(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  jmp(SAME_TILL_END);

  if (UseAVX >= 2) {
    bind(VECTOR32_NOT_EQUAL);
    // Build an inverted byte-equality mask, find its first set bit.
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
    vpmovmskb(tmp1, rymm0);
    bsfq(tmp1, tmp1);
    addq(result, tmp1);
    shrq(result);   // byte offset -> element index (shift by cl)
    jmp(DONE);
  }

  bind(VECTOR16_NOT_EQUAL);
  if (UseAVX >= 2) {
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
    pxor(rymm0, rymm2);
  } else {
    pcmpeqb(rymm2, rymm2);
    pxor(rymm0, rymm1);
    pcmpeqb(rymm0, rymm1);
    pxor(rymm0, rymm2);
  }
  pmovmskb(tmp1, rymm0);
  bsfq(tmp1, tmp1);
  addq(result, tmp1);
  shrq(result);   // byte offset -> element index (shift by cl)
  jmpb(DONE);

  bind(VECTOR8_NOT_EQUAL);
  bind(VECTOR4_NOT_EQUAL);
  // tmp1 holds the xor of the mismatching words: lowest set bit locates
  // the first differing byte (bit index / 8).
  bsfq(tmp1, tmp1);
  shrq(tmp1, 3);
  addq(result, tmp1);
  bind(BYTES_NOT_EQUAL);
  shrq(result);   // byte offset -> element index (shift by cl)
  jmpb(DONE);

  bind(SAME_TILL_END);
  mov64(result, -1);   // no mismatch found

  bind(DONE);
}
7146
7147 //Helper functions for square_to_len()
7148
/**
 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
 * Preserves x and z and modifies rest of the registers.
 * Digits are 32-bit big-endian; tmp5 carries the bit shifted out of each
 * product into the next iteration.
 */
void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  // Perform square and right shift by 1
  // Handle odd xlen case first, then for even xlen do the following
  // jlong carry = 0;
  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
  //   huge_128 product = x[j:j+1] * x[j:j+1];
  //   z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
  //   z[i+2:i+3] = (jlong)(product >>> 1);
  //   carry = (jlong)product;
  // }

  xorq(tmp5, tmp5);     // carry
  xorq(rdxReg, rdxReg);
  xorl(tmp1, tmp1);     // index for x
  xorl(tmp4, tmp4);     // index for z

  Label L_first_loop, L_first_loop_exit;

  testl(xlen, 1);
  jccb(Assembler::zero, L_first_loop); //jump if xlen is even

  // Square and right shift by 1 the odd element using 32 bit multiply
  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
  imulq(raxReg, raxReg);   // 32-bit digit squared fits in 64 bits
  shrq(raxReg, 1);
  adcq(tmp5, 0);           // capture the shifted-out bit as the carry
  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
  incrementl(tmp1);
  addl(tmp4, 2);

  // Square and right shift by 1 the rest using 64 bit multiply
  bind(L_first_loop);
  cmpptr(tmp1, xlen);
  jccb(Assembler::equal, L_first_loop_exit);

  // Square
  movq(raxReg, Address(x, tmp1, Address::times_4,  0));
  rorq(raxReg, 32);    // convert big-endian to little-endian
  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax

  // Right shift by 1 and save carry: the bit shifted out of tmp5 rotates
  // into rdx, out of rdx into rax, and the bit out of rax becomes the new
  // carry (recovered via adcq).
  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
  rcrq(rdxReg, 1);
  rcrq(raxReg, 1);
  adcq(tmp5, 0);

  // Store result in z
  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
  movq(Address(z, tmp4, Address::times_4, 8), raxReg);

  // Update indices for x and z
  addl(tmp1, 2);
  addl(tmp4, 4);
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}
7210
7211
/**
 * Perform the following multiply add operation using BMI2 instructions
 * carry:sum = sum + op1*op2 + carry
 * op2 should be in rdx
 * op2 is preserved, all other registers are modified
 */
void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
  // assert op2 is rdx
  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
  addq(sum, carry);
  adcq(tmp2, 0);   // carry-out of (sum + carry) into the high word
  addq(sum, op1);
  adcq(tmp2, 0);   // carry-out of (sum + low product) into the high word
  movq(carry, tmp2);
}
7227
/**
 * Perform the following multiply add operation:
 * carry:sum = sum + op1*op2 + carry
 * Preserves op1, op2 and modifies rest of registers
 * Non-BMI2 variant: raxReg/rdxReg receive the implicit mulq operands.
 */
void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
  // rdx:rax = op1 * op2
  movq(raxReg, op2);
  mulq(op1);   // implicit: rax * op1 -> rdx:rax

  //  rdx:rax = sum + carry + rdx:rax
  addq(sum, carry);
  adcq(rdxReg, 0);   // fold carry-out into the high word
  addq(sum, raxReg);
  adcq(rdxReg, 0);   // fold carry-out into the high word

  // carry:sum = rdx:sum
  movq(carry, rdxReg);
}
7247
/**
 * Add 64 bit long carry into z[] with carry propagation.
 * Preserves z and carry register values and modifies rest of registers.
 * zlen indexes 32-bit digits; the propagation loop walks towards index 0
 * while additions keep overflowing.
 */
void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
  Label L_fourth_loop, L_fourth_loop_exit;

  movl(tmp1, 1);
  subl(zlen, 2);
  addq(Address(z, zlen, Address::times_4, 0), carry);

  // Propagate the carry flag set by the preceding addq into lower-index
  // digit pairs, one at a time, until no carry-out occurs or z is exhausted.
  bind(L_fourth_loop);
  jccb(Assembler::carryClear, L_fourth_loop_exit);
  subl(zlen, 2);
  jccb(Assembler::negative, L_fourth_loop_exit);
  addq(Address(z, zlen, Address::times_4, 0), tmp1);
  jmp(L_fourth_loop);
  bind(L_fourth_loop_exit);
}
7268
/**
 * Shift z[] left by 1 bit.
 * Preserves x, len, z and zlen registers and modifies rest of the registers.
 * Note: x and len are not referenced by this implementation.
 */
void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {

  Label L_fifth_loop, L_fifth_loop_exit;

  // Fifth loop
  // Perform primitiveLeftShift(z, zlen, 1)

  const Register prev_carry = tmp1;
  const Register new_carry = tmp4;
  const Register value = tmp2;
  const Register zidx = tmp3;

  // int zidx, carry;
  // long value;
  // carry = 0;
  // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
  //    (carry:value)  = (z[zidx] << 1) | carry ;
  //    z[zidx] = value;
  // }

  movl(zidx, zlen);
  xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register

  bind(L_fifth_loop);
  decl(zidx);  // Use decl to preserve carry flag
  decl(zidx);  // (decl updates SF/ZF but leaves CF untouched)
  jccb(Assembler::negative, L_fifth_loop_exit);

  if (UseBMI2Instructions) {
    // rclq rotates the previous iteration's carry flag in and leaves the
    // shifted-out bit in CF, which survives the two decl's above.
    movq(value, Address(z, zidx, Address::times_4, 0));
    rclq(value, 1);
    rorxq(value, value, 32);
    movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
  }
  else {
    // clear new_carry
    xorl(new_carry, new_carry);

    // Shift z[i] by 1, or in previous carry and save new carry
    movq(value, Address(z, zidx, Address::times_4, 0));
    shlq(value, 1);
    adcl(new_carry, 0);   // capture the bit shifted out by shlq

    orq(value, prev_carry);
    rorq(value, 0x20);    // restore big-endian 32-bit digit order
    movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form

    // Set previous carry = new carry
    movl(prev_carry, new_carry);
  }
  jmp(L_fifth_loop);

  bind(L_fifth_loop_exit);
}
7328
7329
7330 /**
7331 * Code for BigInteger::squareToLen() intrinsic
7332 *
7333 * rdi: x
7334 * rsi: len
7335 * r8: z
7336 * rcx: zlen
7337 * r12: tmp1
7338 * r13: tmp2
7339 * r14: tmp3
7340 * r15: tmp4
7341 * rbx: tmp5
7342 *
7343 */
7344 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7345
7346 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
7347 push(tmp1);
7348 push(tmp2);
7349 push(tmp3);
7350 push(tmp4);
7351 push(tmp5);
7352
7353 // First loop
7354 // Store the squares, right shifted one bit (i.e., divided by 2).
7355 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
7356
7357 // Add in off-diagonal sums.
7358 //
7359 // Second, third (nested) and fourth loops.
7360 // zlen +=2;
7361 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
7362 // carry = 0;
7363 // long op2 = x[xidx:xidx+1];
7364 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
7365 // k -= 2;
7366 // long op1 = x[j:j+1];
7367 // long sum = z[k:k+1];
7368 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
7369 // z[k:k+1] = sum;
7370 // }
7371 // add_one_64(z, k, carry, tmp_regs);
7372 // }
7373
7374 const Register carry = tmp5;
7375 const Register sum = tmp3;
7376 const Register op1 = tmp4;
7377 Register op2 = tmp2;
7378
7379 push(zlen);
7380 push(len);
7381 addl(zlen,2);
7382 bind(L_second_loop);
7383 xorq(carry, carry);
7384 subl(zlen, 4);
7385 subl(len, 2);
7386 push(zlen);
7387 push(len);
7388 cmpl(len, 0);
7389 jccb(Assembler::lessEqual, L_second_loop_exit);
7390
7391 // Multiply an array by one 64 bit long.
7392 if (UseBMI2Instructions) {
7393 op2 = rdxReg;
7394 movq(op2, Address(x, len, Address::times_4, 0));
7395 rorxq(op2, op2, 32);
7396 }
7397 else {
7398 movq(op2, Address(x, len, Address::times_4, 0));
7399 rorq(op2, 32);
7400 }
7401
7402 bind(L_third_loop);
7403 decrementl(len);
7404 jccb(Assembler::negative, L_third_loop_exit);
7405 decrementl(len);
7406 jccb(Assembler::negative, L_last_x);
7407
7408 movq(op1, Address(x, len, Address::times_4, 0));
7409 rorq(op1, 32);
7410
7411 bind(L_multiply);
7412 subl(zlen, 2);
7413 movq(sum, Address(z, zlen, Address::times_4, 0));
7414
7415 // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
7416 if (UseBMI2Instructions) {
7417 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
7418 }
7419 else {
7420 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7421 }
7422
7423 movq(Address(z, zlen, Address::times_4, 0), sum);
7424
7425 jmp(L_third_loop);
7426 bind(L_third_loop_exit);
7427
7428 // Fourth loop
7429 // Add 64 bit long carry into z with carry propagation.
7430 // Uses offsetted zlen.
7431 add_one_64(z, zlen, carry, tmp1);
7432
7433 pop(len);
7434 pop(zlen);
7435 jmp(L_second_loop);
7436
7437 // Next infrequent code is moved outside loops.
7438 bind(L_last_x);
7439 movl(op1, Address(x, 0));
7440 jmp(L_multiply);
7441
7442 bind(L_second_loop_exit);
7443 pop(len);
7444 pop(zlen);
7445 pop(len);
7446 pop(zlen);
7447
7448 // Fifth loop
7449 // Shift z left 1 bit.
7450 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
7451
7452 // z[zlen-1] |= x[len-1] & 1;
7453 movl(tmp3, Address(x, len, Address::times_4, -4));
7454 andl(tmp3, 1);
7455 orl(Address(z, zlen, Address::times_4, -4), tmp3);
7456
7457 pop(tmp5);
7458 pop(tmp4);
7459 pop(tmp3);
7460 pop(tmp2);
7461 pop(tmp1);
7462 }
7463
7464 /**
7465 * Helper function for mul_add()
7466 * Multiply the in[] by int k and add to out[] starting at offset offs using
7467 * 128 bit by 32 bit multiply and return the carry in tmp5.
7468 * Only quad int aligned length of in[] is operated on in this function.
7469 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
7470 * This function preserves out, in and k registers.
7471 * len and offset point to the appropriate index in "in" & "out" correspondingly
7472 * tmp5 has the carry.
7473 * other registers are temporary and are modified.
7474 *
7475 */
7476 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
7477 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
7478 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7479
7480 Label L_first_loop, L_first_loop_exit;
7481
7482 movl(tmp1, len);
7483 shrl(tmp1, 2);
7484
7485 bind(L_first_loop);
7486 subl(tmp1, 1);
7487 jccb(Assembler::negative, L_first_loop_exit);
7488
7489 subl(len, 4);
7490 subl(offset, 4);
7491
7492 Register op2 = tmp2;
7493 const Register sum = tmp3;
7494 const Register op1 = tmp4;
7495 const Register carry = tmp5;
7496
7497 if (UseBMI2Instructions) {
7498 op2 = rdxReg;
7499 }
7500
7501 movq(op1, Address(in, len, Address::times_4, 8));
7502 rorq(op1, 32);
7503 movq(sum, Address(out, offset, Address::times_4, 8));
7504 rorq(sum, 32);
7505 if (UseBMI2Instructions) {
7506 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7507 }
7508 else {
7509 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7510 }
7511 // Store back in big endian from little endian
7512 rorq(sum, 0x20);
7513 movq(Address(out, offset, Address::times_4, 8), sum);
7514
7515 movq(op1, Address(in, len, Address::times_4, 0));
7516 rorq(op1, 32);
7517 movq(sum, Address(out, offset, Address::times_4, 0));
7518 rorq(sum, 32);
7519 if (UseBMI2Instructions) {
7520 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7521 }
7522 else {
7523 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7524 }
7525 // Store back in big endian from little endian
7526 rorq(sum, 0x20);
7527 movq(Address(out, offset, Address::times_4, 0), sum);
7528
7529 jmp(L_first_loop);
7530 bind(L_first_loop_exit);
7531 }
7532
7533 /**
7534 * Code for BigInteger::mulAdd() intrinsic
7535 *
7536 * rdi: out
7537 * rsi: in
7538 * r11: offs (out.length - offset)
7539 * rcx: len
7540 * r8: k
7541 * r12: tmp1
7542 * r13: tmp2
7543 * r14: tmp3
7544 * r15: tmp4
7545 * rbx: tmp5
7546 * Multiply the in[] by word k and add to out[], return the carry in rax
7547 */
7548 void MacroAssembler::mul_add(Register out, Register in, Register offs,
7549 Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
7550 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7551
7552 Label L_carry, L_last_in, L_done;
7553
7554 // carry = 0;
7555 // for (int j=len-1; j >= 0; j--) {
7556 // long product = (in[j] & LONG_MASK) * kLong +
7557 // (out[offs] & LONG_MASK) + carry;
7558 // out[offs--] = (int)product;
7559 // carry = product >>> 32;
7560 // }
7561 //
7562 push(tmp1);
7563 push(tmp2);
7564 push(tmp3);
7565 push(tmp4);
7566 push(tmp5);
7567
7568 Register op2 = tmp2;
7569 const Register sum = tmp3;
7570 const Register op1 = tmp4;
7571 const Register carry = tmp5;
7572
7573 if (UseBMI2Instructions) {
7574 op2 = rdxReg;
7575 movl(op2, k);
7576 }
7577 else {
7578 movl(op2, k);
7579 }
7580
7581 xorq(carry, carry);
7582
7583 //First loop
7584
7585 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
7586 //The carry is in tmp5
7587 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
7588
7589 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
7590 decrementl(len);
7591 jccb(Assembler::negative, L_carry);
7592 decrementl(len);
7593 jccb(Assembler::negative, L_last_in);
7594
7595 movq(op1, Address(in, len, Address::times_4, 0));
7596 rorq(op1, 32);
7597
7598 subl(offs, 2);
7599 movq(sum, Address(out, offs, Address::times_4, 0));
7600 rorq(sum, 32);
7601
7602 if (UseBMI2Instructions) {
7603 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7604 }
7605 else {
7606 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7607 }
7608
7609 // Store back in big endian from little endian
7610 rorq(sum, 0x20);
7611 movq(Address(out, offs, Address::times_4, 0), sum);
7612
7613 testl(len, len);
7614 jccb(Assembler::zero, L_carry);
7615
7616 //Multiply the last in[] entry, if any
7617 bind(L_last_in);
7618 movl(op1, Address(in, 0));
7619 movl(sum, Address(out, offs, Address::times_4, -4));
7620
7621 movl(raxReg, k);
7622 mull(op1); //tmp4 * eax -> edx:eax
7623 addl(sum, carry);
7624 adcl(rdxReg, 0);
7625 addl(sum, raxReg);
7626 adcl(rdxReg, 0);
7627 movl(carry, rdxReg);
7628
7629 movl(Address(out, offs, Address::times_4, -4), sum);
7630
7631 bind(L_carry);
7632 //return tmp5/carry as carry in rax
7633 movl(rax, carry);
7634
7635 bind(L_done);
7636 pop(tmp5);
7637 pop(tmp4);
7638 pop(tmp3);
7639 pop(tmp2);
7640 pop(tmp1);
7641 }
7642
7643 /**
7644 * Emits code to update CRC-32 with a byte value according to constants in table
7645 *
7646 * @param [in,out]crc Register containing the crc.
7647 * @param [in]val Register containing the byte to fold into the CRC.
7648 * @param [in]table Register containing the table of crc constants.
7649 *
7650 * uint32_t crc;
7651 * val = crc_table[(val ^ crc) & 0xFF];
7652 * crc = val ^ (crc >> 8);
7653 *
7654 */
7655 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7656 xorl(val, crc);
7657 andl(val, 0xFF);
7658 shrl(crc, 8); // unsigned shift
7659 xorl(crc, Address(table, val, Address::times_4, 0));
7660 }
7661
7662 /**
7663 * Fold 128-bit data chunk
7664 */
7665 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7666 if (UseAVX > 0) {
7667 vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7668 vpclmulldq(xcrc, xK, xcrc); // [63:0]
7669 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7670 pxor(xcrc, xtmp);
7671 } else {
7672 movdqa(xtmp, xcrc);
7673 pclmulhdq(xtmp, xK); // [123:64]
7674 pclmulldq(xcrc, xK); // [63:0]
7675 pxor(xcrc, xtmp);
7676 movdqu(xtmp, Address(buf, offset));
7677 pxor(xcrc, xtmp);
7678 }
7679 }
7680
// Fold 128-bit data chunk, register-operand variant: identical to the
// memory-operand overload above except the data to fold in is already in xbuf.
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
  if (UseAVX > 0) {
    vpclmulhdq(xtmp, xK, xcrc);
    vpclmulldq(xcrc, xK, xcrc);
    pxor(xcrc, xbuf);
    pxor(xcrc, xtmp);
  } else {
    movdqa(xtmp, xcrc);
    pclmulhdq(xtmp, xK);
    pclmulldq(xcrc, xK);
    pxor(xcrc, xbuf);
    pxor(xcrc, xtmp);
  }
}
7695
7696 /**
7697 * 8-bit folds to compute 32-bit CRC
7698 *
7699 * uint64_t xcrc;
7700 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7701 */
7702 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7703 movdl(tmp, xcrc);
7704 andl(tmp, 0xFF);
7705 movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7706 psrldq(xcrc, 1); // unsigned shift one byte
7707 pxor(xcrc, xtmp);
7708 }
7709
7710 /**
7711 * uint32_t crc;
7712 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7713 */
7714 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7715 movl(tmp, crc);
7716 andl(tmp, 0xFF);
7717 shrl(crc, 8);
7718 xorl(crc, Address(table, tmp, Address::times_4, 0));
7719 }
7720
7721 /**
7722 * @param crc register containing existing CRC (32-bit)
7723 * @param buf register pointing to input byte buffer (byte*)
7724 * @param len register containing number of bytes
7725 * @param table register that will contain address of CRC table
7726 * @param tmp scratch register
7727 */
7728 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7729 assert_different_registers(crc, buf, len, table, tmp, rax);
7730
7731 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7732 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7733
7734 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7735 // context for the registers used, where all instructions below are using 128-bit mode
7736 // On EVEX without VL and BW, these instructions will all be AVX.
7737 lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7738 notl(crc); // ~crc
7739 cmpl(len, 16);
7740 jcc(Assembler::less, L_tail);
7741
7742 // Align buffer to 16 bytes
7743 movl(tmp, buf);
7744 andl(tmp, 0xF);
7745 jccb(Assembler::zero, L_aligned);
7746 subl(tmp, 16);
7747 addl(len, tmp);
7748
7749 align(4);
7750 BIND(L_align_loop);
7751 movsbl(rax, Address(buf, 0)); // load byte with sign extension
7752 update_byte_crc32(crc, rax, table);
7753 increment(buf);
7754 incrementl(tmp);
7755 jccb(Assembler::less, L_align_loop);
7756
7757 BIND(L_aligned);
7758 movl(tmp, len); // save
7759 shrl(len, 4);
7760 jcc(Assembler::zero, L_tail_restore);
7761
7762 // Fold crc into first bytes of vector
7763 movdqa(xmm1, Address(buf, 0));
7764 movdl(rax, xmm1);
7765 xorl(crc, rax);
7766 if (VM_Version::supports_sse4_1()) {
7767 pinsrd(xmm1, crc, 0);
7768 } else {
7769 pinsrw(xmm1, crc, 0);
7770 shrl(crc, 16);
7771 pinsrw(xmm1, crc, 1);
7772 }
7773 addptr(buf, 16);
7774 subl(len, 4); // len > 0
7775 jcc(Assembler::less, L_fold_tail);
7776
7777 movdqa(xmm2, Address(buf, 0));
7778 movdqa(xmm3, Address(buf, 16));
7779 movdqa(xmm4, Address(buf, 32));
7780 addptr(buf, 48);
7781 subl(len, 3);
7782 jcc(Assembler::lessEqual, L_fold_512b);
7783
7784 // Fold total 512 bits of polynomial on each iteration,
7785 // 128 bits per each of 4 parallel streams.
7786 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32), rscratch1);
7787
7788 align32();
7789 BIND(L_fold_512b_loop);
7790 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
7791 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
7792 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
7793 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
7794 addptr(buf, 64);
7795 subl(len, 4);
7796 jcc(Assembler::greater, L_fold_512b_loop);
7797
7798 // Fold 512 bits to 128 bits.
7799 BIND(L_fold_512b);
7800 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
7801 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
7802 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7803 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7804
7805 // Fold the rest of 128 bits data chunks
7806 BIND(L_fold_tail);
7807 addl(len, 3);
7808 jccb(Assembler::lessEqual, L_fold_128b);
7809 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
7810
7811 BIND(L_fold_tail_loop);
7812 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
7813 addptr(buf, 16);
7814 decrementl(len);
7815 jccb(Assembler::greater, L_fold_tail_loop);
7816
7817 // Fold 128 bits in xmm1 down into 32 bits in crc register.
7818 BIND(L_fold_128b);
7819 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()), rscratch1);
7820 if (UseAVX > 0) {
7821 vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7822 vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7823 vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7824 } else {
7825 movdqa(xmm2, xmm0);
7826 pclmulqdq(xmm2, xmm1, 0x1);
7827 movdqa(xmm3, xmm0);
7828 pand(xmm3, xmm2);
7829 pclmulqdq(xmm0, xmm3, 0x1);
7830 }
7831 psrldq(xmm1, 8);
7832 psrldq(xmm2, 4);
7833 pxor(xmm0, xmm1);
7834 pxor(xmm0, xmm2);
7835
7836 // 8 8-bit folds to compute 32-bit CRC.
7837 for (int j = 0; j < 4; j++) {
7838 fold_8bit_crc32(xmm0, table, xmm1, rax);
7839 }
7840 movdl(crc, xmm0); // mov 32 bits to general register
7841 for (int j = 0; j < 4; j++) {
7842 fold_8bit_crc32(crc, table, rax);
7843 }
7844
7845 BIND(L_tail_restore);
7846 movl(len, tmp); // restore
7847 BIND(L_tail);
7848 andl(len, 0xf);
7849 jccb(Assembler::zero, L_exit);
7850
7851 // Fold the rest of bytes
7852 align(4);
7853 BIND(L_tail_loop);
7854 movsbl(rax, Address(buf, 0)); // load byte with sign extension
7855 update_byte_crc32(crc, rax, table);
7856 increment(buf);
7857 decrementl(len);
7858 jccb(Assembler::greater, L_tail_loop);
7859
7860 BIND(L_exit);
7861 notl(crc); // ~c
7862 }
7863
// Helper function for AVX 512 CRC32
// Fold 512-bit data chunks: multiply the running state xcrc by the folding
// constants in xK (both 64-bit halves of each lane) and xor in the next 64
// bytes of input at [buf + pos + offset].
// Clobbers xmm2 and xmm3 in addition to xtmp.
void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
                                             Register pos, int offset) {
  evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
  evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
  evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
  evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
  evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
}
7874
// Helper function for AVX 512 CRC32
// Compute CRC32 for < 256B buffers.
// Short inputs are assembled into a single 16-byte chunk (using stack scratch
// space for lengths < 16) and control jumps back into the caller-supplied
// labels of kernel_crc32_avx512 for the shared reduction/Barrett code.
// NOTE: falls through into the caller's L_barrett on lengths < 4.
void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
                                              Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
                                              Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {

  Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
  Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
  Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;

  // check if there is enough buffer to be able to fold 16B at a time
  cmpl(len, 32);
  jcc(Assembler::less, L_less_than_32);

  // if there is, load the constants
  movdqu(xmm10, Address(table, 1 * 16));    //rk1 and rk2 in xmm10
  movdl(xmm0, crc);                         // get the initial crc value
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
  pxor(xmm7, xmm0);

  // update the buffer pointer
  addl(pos, 16);
  //update the counter.subtract 32 instead of 16 to save one instruction from the loop
  subl(len, 32);
  jmp(L_16B_reduction_loop);

  bind(L_less_than_32);
  //mov initial crc to the return value. this is necessary for zero - length buffers.
  movl(rax, crc);
  testl(len, len);
  jcc(Assembler::equal, L_cleanup);

  movdl(xmm0, crc);                        //get the initial crc value

  cmpl(len, 16);
  jcc(Assembler::equal, L_exact_16_left);
  jcc(Assembler::less, L_less_than_16_left);

  // 17..31 bytes: fold one 16-byte chunk, leave the remainder to the
  // caller's get-last-two-xmms path.
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
  pxor(xmm7, xmm0);                       //xor the initial crc value
  addl(pos, 16);
  subl(len, 16);
  movdqu(xmm10, Address(table, 1 * 16));    // rk1 and rk2 in xmm10
  jmp(L_get_last_two_xmms);

  bind(L_less_than_16_left);
  //use stack space to load data less than 16 bytes, zero - out the 16B in memory first.
  pxor(xmm1, xmm1);
  movptr(tmp1, rsp);
  movdqu(Address(tmp1, 0 * 16), xmm1);

  cmpl(len, 4);
  jcc(Assembler::less, L_only_less_than_4);

  //backup the counter value (used below to select the alignment shuffle mask)
  movl(tmp2, len);
  cmpl(len, 8);
  jcc(Assembler::less, L_less_than_8_left);

  //load 8 Bytes
  movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
  movq(Address(tmp1, 0 * 16), rax);
  addptr(tmp1, 8);
  subl(len, 8);
  addl(pos, 8);

  bind(L_less_than_8_left);
  cmpl(len, 4);
  jcc(Assembler::less, L_less_than_4_left);

  //load 4 Bytes
  movl(rax, Address(buf, pos, Address::times_1, 0));
  movl(Address(tmp1, 0 * 16), rax);
  addptr(tmp1, 4);
  subl(len, 4);
  addl(pos, 4);

  bind(L_less_than_4_left);
  cmpl(len, 2);
  jcc(Assembler::less, L_less_than_2_left);

  // load 2 Bytes
  movw(rax, Address(buf, pos, Address::times_1, 0));
  movl(Address(tmp1, 0 * 16), rax);
  addptr(tmp1, 2);
  subl(len, 2);
  addl(pos, 2);

  bind(L_less_than_2_left);
  cmpl(len, 1);
  jcc(Assembler::less, L_zero_left);

  // load 1 Byte
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0 * 16), rax);

  bind(L_zero_left);
  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0);                       //xor the initial crc value

  // Shift the partial chunk into position using the length-indexed shuffle
  // table (tmp2 holds the original remaining length).
  lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
  movdqu(xmm0, Address(rax, tmp2));
  pshufb(xmm7, xmm0);
  jmp(L_128_done);

  bind(L_exact_16_left);
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
  pxor(xmm7, xmm0);                       //xor the initial crc value
  jmp(L_128_done);

  // 1..3 byte inputs: place the bytes, then left-shift the 16-byte register
  // so the data sits at the top, and go straight to Barrett reduction.
  bind(L_only_less_than_4);
  cmpl(len, 3);
  jcc(Assembler::less, L_only_less_than_3);

  // load 3 Bytes
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movb(rax, Address(buf, pos, Address::times_1, 1));
  movb(Address(tmp1, 1), rax);

  movb(rax, Address(buf, pos, Address::times_1, 2));
  movb(Address(tmp1, 2), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0);                       //xor the initial crc value

  pslldq(xmm7, 0x5);
  jmp(L_barrett);
  bind(L_only_less_than_3);
  cmpl(len, 2);
  jcc(Assembler::less, L_only_less_than_2);

  // load 2 Bytes
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movb(rax, Address(buf, pos, Address::times_1, 1));
  movb(Address(tmp1, 1), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0);                       //xor the initial crc value

  pslldq(xmm7, 0x6);
  jmp(L_barrett);

  bind(L_only_less_than_2);
  //load 1 Byte
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0);                       //xor the initial crc value

  pslldq(xmm7, 0x7);
}
8031
8032 /**
8033 * Compute CRC32 using AVX512 instructions
8034 * param crc register containing existing CRC (32-bit)
8035 * param buf register pointing to input byte buffer (byte*)
8036 * param len register containing number of bytes
8037 * param table address of crc or crc32c table
8038 * param tmp1 scratch register
8039 * param tmp2 scratch register
8040 * return rax result register
8041 *
8042 * This routine is identical for crc32c with the exception of the precomputed constant
8043 * table which will be passed as the table argument. The calculation steps are
8044 * the same for both variants.
8045 */
8046 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
8047 assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
8048
8049 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8050 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8051 Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
8052 Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
8053 Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
8054
8055 const Register pos = r12;
8056 push(r12);
8057 subptr(rsp, 16 * 2 + 8);
8058
8059 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
8060 // context for the registers used, where all instructions below are using 128-bit mode
8061 // On EVEX without VL and BW, these instructions will all be AVX.
8062 movl(pos, 0);
8063
8064 // check if smaller than 256B
8065 cmpl(len, 256);
8066 jcc(Assembler::less, L_less_than_256);
8067
8068 // load the initial crc value
8069 movdl(xmm10, crc);
8070
8071 // receive the initial 64B data, xor the initial crc value
8072 evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
8073 evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
8074 evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
8075 evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
8076
8077 subl(len, 256);
8078 cmpl(len, 256);
8079 jcc(Assembler::less, L_fold_128_B_loop);
8080
8081 evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
8082 evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
8083 evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
8084 subl(len, 256);
8085
8086 bind(L_fold_256_B_loop);
8087 addl(pos, 256);
8088 fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
8089 fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
8090 fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
8091 fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
8092
8093 subl(len, 256);
8094 jcc(Assembler::greaterEqual, L_fold_256_B_loop);
8095
8096 // Fold 256 into 128
8097 addl(pos, 256);
8098 evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
8099 evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
8100 vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
8101
8102 evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
8103 evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
8104 vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
8105
8106 evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
8107 evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
8108
8109 addl(len, 128);
8110 jmp(L_fold_128_B_register);
8111
8112 // at this section of the code, there is 128 * x + y(0 <= y<128) bytes of buffer.The fold_128_B_loop
8113 // loop will fold 128B at a time until we have 128 + y Bytes of buffer
8114
8115 // fold 128B at a time.This section of the code folds 8 xmm registers in parallel
8116 bind(L_fold_128_B_loop);
8117 addl(pos, 128);
8118 fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
8119 fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
8120
8121 subl(len, 128);
8122 jcc(Assembler::greaterEqual, L_fold_128_B_loop);
8123
8124 addl(pos, 128);
8125
8126 // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
8127 // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
8128 bind(L_fold_128_B_register);
8129 evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
8130 evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
8131 evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
8132 evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
8133 // save last that has no multiplicand
8134 vextracti64x2(xmm7, xmm4, 3);
8135
8136 evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
8137 evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
8138 // Needed later in reduction loop
8139 movdqu(xmm10, Address(table, 1 * 16));
8140 vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
8141 vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
8142
8143 // Swap 1,0,3,2 - 01 00 11 10
8144 evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
8145 evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
8146 vextracti128(xmm5, xmm8, 1);
8147 evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
8148
8149 // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
8150 // instead of a cmp instruction, we use the negative flag with the jl instruction
8151 addl(len, 128 - 16);
8152 jcc(Assembler::less, L_final_reduction_for_128);
8153
8154 bind(L_16B_reduction_loop);
8155 vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8156 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8157 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8158 movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
8159 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8160 addl(pos, 16);
8161 subl(len, 16);
8162 jcc(Assembler::greaterEqual, L_16B_reduction_loop);
8163
8164 bind(L_final_reduction_for_128);
8165 addl(len, 16);
8166 jcc(Assembler::equal, L_128_done);
8167
8168 bind(L_get_last_two_xmms);
8169 movdqu(xmm2, xmm7);
8170 addl(pos, len);
8171 movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
8172 subl(pos, len);
8173
8174 // get rid of the extra data that was loaded before
8175 // load the shift constant
8176 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
8177 movdqu(xmm0, Address(rax, len));
8178 addl(rax, len);
8179
8180 vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8181 //Change mask to 512
8182 vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
8183 vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
8184
8185 blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
8186 vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8187 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8188 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8189 vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
8190
8191 bind(L_128_done);
8192 // compute crc of a 128-bit value
8193 movdqu(xmm10, Address(table, 3 * 16));
8194 movdqu(xmm0, xmm7);
8195
8196 // 64b fold
8197 vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
8198 vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
8199 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8200
8201 // 32b fold
8202 movdqu(xmm0, xmm7);
8203 vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
8204 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8205 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8206 jmp(L_barrett);
8207
8208 bind(L_less_than_256);
8209 kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
8210
8211 //barrett reduction
8212 bind(L_barrett);
8213 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
8214 movdqu(xmm1, xmm7);
8215 movdqu(xmm2, xmm7);
8216 movdqu(xmm10, Address(table, 4 * 16));
8217
8218 pclmulqdq(xmm7, xmm10, 0x0);
8219 pxor(xmm7, xmm2);
8220 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
8221 movdqu(xmm2, xmm7);
8222 pclmulqdq(xmm7, xmm10, 0x10);
8223 pxor(xmm7, xmm2);
8224 pxor(xmm7, xmm1);
8225 pextrd(crc, xmm7, 2);
8226
8227 bind(L_cleanup);
8228 addptr(rsp, 16 * 2 + 8);
8229 pop(r12);
8230 }
8231
// S. Gueron / Information Processing Letters 112 (2012) 184
// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
// Output: the 64-bit carry-less product of B * CONST
//
// Table-driven fallback for CPUs without PCLMULQDQ: each byte of 'in'
// indexes a 256-entry table of 64-bit precomputed products (table n selected
// by the 'n' argument), and the four partial products are combined with
// byte-position shifts and xors.  'in' holds the result on exit; tmp1-tmp3
// are clobbered.
void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
                                     Register tmp1, Register tmp2, Register tmp3) {
  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
  if (n > 0) {
    addq(tmp3, n * 256 * 8);    // advance to table n (256 entries of 8 bytes)
  }
  // Q1 = TABLEExt[n][B & 0xFF];
  movl(tmp1, in);
  andl(tmp1, 0x000000FF);
  shll(tmp1, 3);                // byte index -> 8-byte entry offset
  addq(tmp1, tmp3);
  movq(tmp1, Address(tmp1, 0));

  // Q2 = TABLEExt[n][B >> 8 & 0xFF];
  movl(tmp2, in);
  shrl(tmp2, 8);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addq(tmp2, tmp3);
  movq(tmp2, Address(tmp2, 0));

  shlq(tmp2, 8);
  xorq(tmp1, tmp2);

  // Q3 = TABLEExt[n][B >> 16 & 0xFF];
  movl(tmp2, in);
  shrl(tmp2, 16);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addq(tmp2, tmp3);
  movq(tmp2, Address(tmp2, 0));

  shlq(tmp2, 16);
  xorq(tmp1, tmp2);

  // Q4 = TABLEExt[n][B >> 24 & 0xFF];
  shrl(in, 24);
  andl(in, 0x000000FF);
  shll(in, 3);
  addq(in, tmp3);
  movq(in, Address(in, 0));

  shlq(in, 24);
  xorq(in, tmp1);
  // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}
8282
// Carry-less multiply of in_out by a precomputed constant, selected by
// const_or_pre_comp_const_index.  Uses a single PCLMULQDQ when available;
// otherwise falls back to the Gueron table-lookup algorithm above.
// in_out holds the 64-bit product on exit.
void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
                                      Register in_out,
                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
                                      XMMRegister w_xtmp2,
                                      Register tmp1,
                                      Register n_tmp2, Register n_tmp3) {
  if (is_pclmulqdq_supported) {
    movdl(w_xtmp1, in_out); // modified blindly

    // When PCLMULQDQ is supported the index argument is the constant itself.
    movl(tmp1, const_or_pre_comp_const_index);
    movdl(w_xtmp2, tmp1);
    pclmulqdq(w_xtmp1, w_xtmp2, 0);

    movdq(in_out, w_xtmp1);
  } else {
    // Without PCLMULQDQ the index selects one of the precomputed tables.
    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
  }
}
8301
8302 // Recombination Alternative 2: No bit-reflections
8303 // T1 = (CRC_A * U1) << 1
8304 // T2 = (CRC_B * U2) << 1
8305 // C1 = T1 >> 32
8306 // C2 = T2 >> 32
8307 // T1 = T1 & 0xFFFFFFFF
8308 // T2 = T2 & 0xFFFFFFFF
8309 // T1 = CRC32(0, T1)
8310 // T2 = CRC32(0, T2)
8311 // C1 = C1 ^ T1
8312 // C2 = C2 ^ T2
8313 // CRC = C1 ^ C2 ^ CRC_C
// Recombines partial CRCs CRC_A (in_out), CRC_B (in1) and CRC_C (in2) into
// in_out, per the "Recombination Alternative 2" formula documented above.
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                     Register tmp1, Register tmp2,
                                     Register n_tmp3) {
  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); // T1 = CRC_A * U1
  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);    // T2 = CRC_B * U2
  shlq(in_out, 1);      // T1 = (CRC_A * U1) << 1
  movl(tmp1, in_out);   // low 32 bits of T1
  shrq(in_out, 32);     // C1 = T1 >> 32
  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4); // T1 = CRC32(0, T1 & 0xFFFFFFFF)
  xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
  shlq(in1, 1);         // same recombination for the B partition
  movl(tmp1, in1);
  shrq(in1, 32);        // C2 = T2 >> 32
  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4); // T2 = CRC32(0, T2 & 0xFFFFFFFF)
  xorl(in1, tmp2);      // C2 = C2 ^ T2
  xorl(in_out, in1);    // C1 ^ C2
  xorl(in_out, in2);    // CRC = C1 ^ C2 ^ CRC_C
}
8335
8336 // Set N to predefined value
8337 // Subtract from a length of a buffer
8338 // execute in a loop:
8339 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
8340 // for i = 1 to N do
8341 // CRC_A = CRC32(CRC_A, A[i])
8342 // CRC_B = CRC32(CRC_B, B[i])
8343 // CRC_C = CRC32(CRC_C, C[i])
8344 // end for
8345 // Recombine
// Processes the buffer in chunks of 3*size bytes: three size-byte partitions
// are CRC'd in parallel (pipelining the crc32 instruction) and then
// recombined. in_out1 = remaining length, in_out2 = buffer pointer,
// in_out3 = running CRC; all three are updated. Stops when fewer than
// 3*size bytes remain.
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
                                       Register in_out1, Register in_out2, Register in_out3,
                                       Register tmp1, Register tmp2, Register tmp3,
                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                       Register tmp4, Register tmp5,
                                       Register n_tmp6) {
  Label L_processPartitions;
  Label L_processPartition;
  Label L_exit;

  bind(L_processPartitions);
  cmpl(in_out1, 3 * size);
  jcc(Assembler::less, L_exit);
  xorl(tmp1, tmp1);        // CRC_B = 0
  xorl(tmp2, tmp2);        // CRC_C = 0
  movq(tmp3, in_out2);
  addq(tmp3, size);        // end address of partition A

  bind(L_processPartition);
  crc32(in_out3, Address(in_out2, 0), 8);     // partition A
  crc32(tmp1, Address(in_out2, size), 8);     // partition B
  crc32(tmp2, Address(in_out2, size * 2), 8); // partition C
  addq(in_out2, 8);
  cmpq(in_out2, tmp3);
  jcc(Assembler::less, L_processPartition);
  // Recombine the three partial CRCs into in_out3.
  crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
                  w_xtmp1, w_xtmp2, w_xtmp3,
                  tmp4, tmp5,
                  n_tmp6);
  addq(in_out2, 2 * size); // skip over partitions B and C, already consumed
  subl(in_out1, 3 * size);
  jmp(L_processPartitions);

  bind(L_exit);
}
8381
8382 // Algorithm 2: Pipelined usage of the CRC32 instruction.
8383 // Input: A buffer I of L bytes.
8384 // Output: the CRC32C value of the buffer.
8385 // Notations:
8386 // Write L = 24N + r, with N = floor (L/24).
8387 // r = L mod 24 (0 <= r < 24).
8388 // Consider I as the concatenation of A|B|C|R, where A, B, C, each,
8389 // N quadwords, and R consists of r bytes.
8390 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
8391 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
8392 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
8393 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
// Top level of the CRC32C intrinsic (Algorithm 2 above).
// in_out = running CRC, in1 = buffer pointer, in2 = remaining length;
// processes HIGH, MIDDLE and LOW chunk sizes, then the 8-byte-aligned
// remainder word-by-word and the final < 8 bytes byte-by-byte.
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
                                          Register tmp1, Register tmp2, Register tmp3,
                                          Register tmp4, Register tmp5, Register tmp6,
                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                          bool is_pclmulqdq_supported) {
  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
  Label L_wordByWord;
  Label L_byteByByteProlog;
  Label L_byteByByte;
  Label L_exit;

  // Load the recombination constants (PCLMULQDQ path) or lookup-table indices
  // (fallback path) for each of the three chunk sizes. Note the u1/u2 pairs
  // are stored swapped relative to the table order.
  if (is_pclmulqdq_supported ) {
    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::crc32c_table_addr();
    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 1);

    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 2);
    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 3);

    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 4);
    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 5);
    assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
  } else {
    const_or_pre_comp_const_index[0] = 1;
    const_or_pre_comp_const_index[1] = 0;

    const_or_pre_comp_const_index[2] = 3;
    const_or_pre_comp_const_index[3] = 2;

    const_or_pre_comp_const_index[4] = 5;
    const_or_pre_comp_const_index[5] = 4;
  }
  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  // tmp1 = end address of the word-by-word region:
  // in1 + (remaining length rounded down to a multiple of 8)
  movl(tmp1, in2);
  andl(tmp1, 0x00000007); // remaining bytes mod 8
  negl(tmp1);
  addl(tmp1, in2);
  addq(tmp1, in1);

  cmpq(in1, tmp1);
  jccb(Assembler::greaterEqual, L_byteByByteProlog);
  align(16);
  BIND(L_wordByWord);
  crc32(in_out, Address(in1, 0), 8);
  addq(in1, 8);
  cmpq(in1, tmp1);
  jcc(Assembler::less, L_wordByWord);

  BIND(L_byteByByteProlog);
  andl(in2, 0x00000007); // number of trailing bytes (0..7)
  movl(tmp2, 1);         // 1-based byte counter

  cmpl(tmp2, in2);
  jccb(Assembler::greater, L_exit);
  BIND(L_byteByByte);
  crc32(in_out, Address(in1, 0), 1);
  incq(in1);
  incl(tmp2);
  cmpl(tmp2, in2);
  jcc(Assembler::lessEqual, L_byteByByte);

  BIND(L_exit);
}
8473 #undef BIND
8474 #undef BLOCK_COMMENT
8475
8476 // Compress char[] array to byte[].
8477 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
8478 // Return the array length if every element in array can be encoded,
8479 // otherwise, the index of first non-latin1 (> 0xff) character.
8480 // @IntrinsicCandidate
8481 // public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8482 // for (int i = 0; i < len; i++) {
8483 // char c = src[srcOff];
8484 // if (c > 0xff) {
8485 // return i; // return index of non-latin1 char
8486 // }
8487 // dst[dstOff] = (byte)c;
8488 // srcOff++;
8489 // dstOff++;
8490 // }
8491 // return len;
8492 // }
// Compress char[] (UTF-16) to byte[] (Latin-1); see the algorithm sketch
// above. On success 'result' holds len; on encountering a char > 0xFF it
// holds the index of that char (result + negative remaining count).
// src/dst/len/tmp5 are clobbered; mask1/mask2 are AVX-512 scratch masks.
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         XMMRegister tmp1Reg, XMMRegister tmp2Reg,
                                         XMMRegister tmp3Reg, XMMRegister tmp4Reg,
                                         Register tmp5, Register result, KRegister mask1, KRegister mask2) {
  Label copy_chars_loop, done, reset_sp, copy_tail;

  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result

  // rsi holds start addr of source char[] to be compressed
  // rdi holds start addr of destination byte[]
  // rdx holds length

  assert(len != result, "");

  // save length for return
  movl(result, len);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_loop_tail, below_threshold, reset_for_copy_tail;

    // alignment
    Label post_alignment;

    // if length of the string is less than 32, handle it the old fashioned way
    testl(len, -32);
    jcc(Assembler::zero, below_threshold);

    // First check whether a character is compressible ( <= 0xFF).
    // Create mask to test for Unicode chars inside zmm vector
    movl(tmp5, 0x00FF);
    evpbroadcastw(tmp2Reg, tmp5, Assembler::AVX_512bit);

    testl(len, -64);
    jccb(Assembler::zero, post_alignment);

    // tmp5 = number of chars needed to reach 32-byte alignment of dst
    movl(tmp5, dst);
    andl(tmp5, (32 - 1));
    negl(tmp5);
    andl(tmp5, (32 - 1));

    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jccb(Assembler::zero, post_alignment);

    // ~(~0 << len), where len is the # of remaining elements to process
    movl(len, 0xFFFFFFFF);
    shlxl(len, len, tmp5);
    notl(len);
    kmovdl(mask2, len);
    movl(len, result);     // restore len, clobbered above

    // Masked load of the alignment prefix; bail to scalar tail if any char
    // in it is > 0xFF.
    evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
    evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
    ktestd(mask1, mask2);
    jcc(Assembler::carryClear, copy_tail);

    evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);

    addptr(src, tmp5);     // advance src by 2*tmp5 bytes (tmp5 chars) ...
    addptr(src, tmp5);     // ... done as two adds of the char count
    addptr(dst, tmp5);     // dst advances by tmp5 bytes
    subl(len, tmp5);

    bind(post_alignment);
    // end of alignment

    movl(tmp5, len);
    andl(tmp5, (32 - 1));    // tail count (in chars)
    andl(len, ~(32 - 1));    // vector count (in chars)
    jccb(Assembler::zero, copy_loop_tail);

    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

    bind(copy_32_loop);
    evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
    evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    kortestdl(mask1, mask1);
    jccb(Assembler::carryClear, reset_for_copy_tail);

    // All elements in current processed chunk are valid candidates for
    // compression. Write a truncated byte elements to the memory.
    evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
    addptr(len, 32);
    jccb(Assembler::notZero, copy_32_loop);

    bind(copy_loop_tail);
    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jcc(Assembler::zero, done);

    movl(len, tmp5);

    // ~(~0 << len), where len is the # of remaining elements to process
    movl(tmp5, 0xFFFFFFFF);
    shlxl(tmp5, tmp5, len);
    notl(tmp5);

    kmovdl(mask2, tmp5);

    // Masked compress of the final partial vector.
    evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
    evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
    ktestd(mask1, mask2);
    jcc(Assembler::carryClear, copy_tail);

    evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
    jmp(done);

    // A non-Latin-1 char was seen mid-vector: fall back to the scalar loop,
    // which will locate the exact failing index.
    bind(reset_for_copy_tail);
    lea(src, Address(src, tmp5, Address::times_2));
    lea(dst, Address(dst, tmp5, Address::times_1));
    subptr(len, tmp5);
    jmp(copy_chars_loop);

    bind(below_threshold);
  }

  if (UseSSE42Intrinsics) {
    Label copy_32_loop, copy_16, copy_tail_sse, reset_for_copy_tail;

    // vectored compression
    testl(len, 0xfffffff8);
    jcc(Assembler::zero, copy_tail);

    movl(tmp5, 0xff00ff00);  // create mask to test for Unicode chars in vectors
    movdl(tmp1Reg, tmp5);
    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg

    andl(len, 0xfffffff0);
    jccb(Assembler::zero, copy_16);

    // compress 16 chars per iter
    pxor(tmp4Reg, tmp4Reg);

    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

    bind(copy_32_loop);
    movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
    por(tmp4Reg, tmp2Reg);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
    por(tmp4Reg, tmp3Reg);
    ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
    jccb(Assembler::notZero, reset_for_copy_tail);
    packuswb(tmp2Reg, tmp3Reg);    // only ASCII chars; compress each to 1 byte
    movdqu(Address(dst, len, Address::times_1), tmp2Reg);
    addptr(len, 16);
    jccb(Assembler::notZero, copy_32_loop);

    // compress next vector of 8 chars (if any)
    bind(copy_16);
    // len = 0
    testl(result, 0x00000008);     // check if there's a block of 8 chars to compress
    jccb(Assembler::zero, copy_tail_sse);

    pxor(tmp3Reg, tmp3Reg);

    movdqu(tmp2Reg, Address(src, 0));
    ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
    jccb(Assembler::notZero, reset_for_copy_tail);
    packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
    movq(Address(dst, 0), tmp2Reg);
    addptr(src, 16);
    addptr(dst, 8);
    jmpb(copy_tail_sse);

    // A non-Latin-1 char was seen: redirect to the scalar tail loop over the
    // chars not yet successfully compressed.
    bind(reset_for_copy_tail);
    movl(tmp5, result);
    andl(tmp5, 0x0000000f);
    lea(src, Address(src, tmp5, Address::times_2));
    lea(dst, Address(dst, tmp5, Address::times_1));
    subptr(len, tmp5);
    jmpb(copy_chars_loop);

    bind(copy_tail_sse);
    movl(len, result);
    andl(len, 0x00000007);    // tail count (in chars)
  }
  // compress 1 char per iter
  bind(copy_tail);
  testl(len, len);
  jccb(Assembler::zero, done);
  lea(src, Address(src, len, Address::times_2));
  lea(dst, Address(dst, len, Address::times_1));
  negptr(len);

  bind(copy_chars_loop);
  load_unsigned_short(tmp5, Address(src, len, Address::times_2));
  testl(tmp5, 0xff00);      // check if Unicode char
  jccb(Assembler::notZero, reset_sp);
  movb(Address(dst, len, Address::times_1), tmp5);  // ASCII char; compress to 1 byte
  increment(len);
  jccb(Assembler::notZero, copy_chars_loop);

  // add len then return (len will be zero if compress succeeded, otherwise negative)
  bind(reset_sp);
  addl(result, len);

  bind(done);
}
8702
8703 // Inflate byte[] array to char[].
8704 // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
8705 // @IntrinsicCandidate
8706 // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
8707 // for (int i = 0; i < len; i++) {
8708 // dst[dstOff++] = (char)(src[srcOff++] & 0xff);
8709 // }
8710 // }
// Inflate byte[] (Latin-1) to char[] (UTF-16); see sketch above.
// Zero-extends each source byte to a 16-bit char. src/dst/len/tmp2 are
// clobbered; 'mask' is AVX-512 scratch. Paths: AVX-512 masked (32 chars/iter),
// AVX2 (16 chars/iter), SSE4.2 (8 chars/iter), scalar tail.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        XMMRegister tmp1, Register tmp2, KRegister mask) {
  Label copy_chars_loop, done, below_threshold, avx3_threshold;
  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp2

  // rsi holds start addr of source byte[] to be inflated
  // rdi holds start addr of destination char[]
  // rdx holds length
  assert_different_registers(src, dst, len, tmp2);
  movl(tmp2, len);    // keep a copy of len for the tail computation
  if ((UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_tail;
    Register tmp3_aliased = len;

    // if length of the string is less than 16, handle it in an old fashioned way
    testl(len, -16);
    jcc(Assembler::zero, below_threshold);

    // below AVX3Threshold, prefer the AVX2/SSE path
    testl(len, -1 * AVX3Threshold);
    jcc(Assembler::zero, avx3_threshold);

    // In order to use only one arithmetic operation for the main loop we use
    // this pre-calculation
    andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
    andl(len, -32);       // vector count
    jccb(Assembler::zero, copy_tail);

    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);


    // inflate 32 chars per iter
    bind(copy_32_loop);
    vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
    evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
    addptr(len, 32);
    jcc(Assembler::notZero, copy_32_loop);

    bind(copy_tail);
    // bail out when there is nothing to be done
    testl(tmp2, -1); // we don't destroy the contents of tmp2 here
    jcc(Assembler::zero, done);

    // ~(~0 << length), where length is the # of remaining elements to process
    movl(tmp3_aliased, -1);
    shlxl(tmp3_aliased, tmp3_aliased, tmp2);
    notl(tmp3_aliased);
    kmovdl(mask, tmp3_aliased);
    // Masked load/zero-extend/store of the final partial vector.
    evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
    evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);

    jmp(done);
    bind(avx3_threshold);
  }
  if (UseSSE42Intrinsics) {
    Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;

    if (UseAVX > 1) {
      andl(tmp2, (16 - 1)); // tail count (in chars), 16 element wide loop
      andl(len, -16);       // vector count
      jccb(Assembler::zero, copy_new_tail);
    } else {
      andl(tmp2, 0x00000007);  // tail count (in chars)
      andl(len, 0xfffffff8);   // vector count (in chars)
      jccb(Assembler::zero, copy_tail);
    }

    // vectored inflation
    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);

    if (UseAVX > 1) {
      bind(copy_16_loop);
      vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
      vmovdqu(Address(dst, len, Address::times_2), tmp1);
      addptr(len, 16);
      jcc(Assembler::notZero, copy_16_loop);

      bind(below_threshold);
      bind(copy_new_tail);
      // inflate one block of 8 (if present), then fall through to the
      // common < 8 chars tail handling
      movl(len, tmp2);
      andl(tmp2, 0x00000007);
      andl(len, 0xFFFFFFF8);
      jccb(Assembler::zero, copy_tail);

      pmovzxbw(tmp1, Address(src, 0));
      movdqu(Address(dst, 0), tmp1);
      addptr(src, 8);
      addptr(dst, 2 * 8);

      jmp(copy_tail, true);
    }

    // inflate 8 chars per iter
    bind(copy_8_loop);
    pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
    movdqu(Address(dst, len, Address::times_2), tmp1);
    addptr(len, 8);
    jcc(Assembler::notZero, copy_8_loop);

    bind(copy_tail);
    movl(len, tmp2);

    cmpl(len, 4);
    jccb(Assembler::less, copy_bytes);

    movdl(tmp1, Address(src, 0));  // load 4 byte chars
    pmovzxbw(tmp1, tmp1);
    movq(Address(dst, 0), tmp1);
    subptr(len, 4);
    addptr(src, 4);
    addptr(dst, 8);

    bind(copy_bytes);
  } else {
    bind(below_threshold);
  }

  testl(len, len);
  jccb(Assembler::zero, done);
  lea(src, Address(src, len, Address::times_1));
  lea(dst, Address(dst, len, Address::times_2));
  negptr(len);

  // inflate 1 char per iter
  bind(copy_chars_loop);
  load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
  movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
  increment(len);
  jcc(Assembler::notZero, copy_chars_loop);

  bind(done);
}
8852
8853 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
8854 switch(type) {
8855 case T_BYTE:
8856 case T_BOOLEAN:
8857 evmovdqub(dst, kmask, src, merge, vector_len);
8858 break;
8859 case T_CHAR:
8860 case T_SHORT:
8861 evmovdquw(dst, kmask, src, merge, vector_len);
8862 break;
8863 case T_INT:
8864 case T_FLOAT:
8865 evmovdqul(dst, kmask, src, merge, vector_len);
8866 break;
8867 case T_LONG:
8868 case T_DOUBLE:
8869 evmovdquq(dst, kmask, src, merge, vector_len);
8870 break;
8871 default:
8872 fatal("Unexpected type argument %s", type2name(type));
8873 break;
8874 }
8875 }
8876
8877
8878 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
8879 switch(type) {
8880 case T_BYTE:
8881 case T_BOOLEAN:
8882 evmovdqub(dst, kmask, src, merge, vector_len);
8883 break;
8884 case T_CHAR:
8885 case T_SHORT:
8886 evmovdquw(dst, kmask, src, merge, vector_len);
8887 break;
8888 case T_INT:
8889 case T_FLOAT:
8890 evmovdqul(dst, kmask, src, merge, vector_len);
8891 break;
8892 case T_LONG:
8893 case T_DOUBLE:
8894 evmovdquq(dst, kmask, src, merge, vector_len);
8895 break;
8896 default:
8897 fatal("Unexpected type argument %s", type2name(type));
8898 break;
8899 }
8900 }
8901
8902 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
8903 switch(type) {
8904 case T_BYTE:
8905 case T_BOOLEAN:
8906 evmovdqub(dst, kmask, src, merge, vector_len);
8907 break;
8908 case T_CHAR:
8909 case T_SHORT:
8910 evmovdquw(dst, kmask, src, merge, vector_len);
8911 break;
8912 case T_INT:
8913 case T_FLOAT:
8914 evmovdqul(dst, kmask, src, merge, vector_len);
8915 break;
8916 case T_LONG:
8917 case T_DOUBLE:
8918 evmovdquq(dst, kmask, src, merge, vector_len);
8919 break;
8920 default:
8921 fatal("Unexpected type argument %s", type2name(type));
8922 break;
8923 }
8924 }
8925
8926 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
8927 switch(masklen) {
8928 case 2:
8929 knotbl(dst, src);
8930 movl(rtmp, 3);
8931 kmovbl(ktmp, rtmp);
8932 kandbl(dst, ktmp, dst);
8933 break;
8934 case 4:
8935 knotbl(dst, src);
8936 movl(rtmp, 15);
8937 kmovbl(ktmp, rtmp);
8938 kandbl(dst, ktmp, dst);
8939 break;
8940 case 8:
8941 knotbl(dst, src);
8942 break;
8943 case 16:
8944 knotwl(dst, src);
8945 break;
8946 case 32:
8947 knotdl(dst, src);
8948 break;
8949 case 64:
8950 knotql(dst, src);
8951 break;
8952 default:
8953 fatal("Unexpected vector length %d", masklen);
8954 break;
8955 }
8956 }
8957
8958 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8959 switch(type) {
8960 case T_BOOLEAN:
8961 case T_BYTE:
8962 kandbl(dst, src1, src2);
8963 break;
8964 case T_CHAR:
8965 case T_SHORT:
8966 kandwl(dst, src1, src2);
8967 break;
8968 case T_INT:
8969 case T_FLOAT:
8970 kanddl(dst, src1, src2);
8971 break;
8972 case T_LONG:
8973 case T_DOUBLE:
8974 kandql(dst, src1, src2);
8975 break;
8976 default:
8977 fatal("Unexpected type argument %s", type2name(type));
8978 break;
8979 }
8980 }
8981
8982 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8983 switch(type) {
8984 case T_BOOLEAN:
8985 case T_BYTE:
8986 korbl(dst, src1, src2);
8987 break;
8988 case T_CHAR:
8989 case T_SHORT:
8990 korwl(dst, src1, src2);
8991 break;
8992 case T_INT:
8993 case T_FLOAT:
8994 kordl(dst, src1, src2);
8995 break;
8996 case T_LONG:
8997 case T_DOUBLE:
8998 korql(dst, src1, src2);
8999 break;
9000 default:
9001 fatal("Unexpected type argument %s", type2name(type));
9002 break;
9003 }
9004 }
9005
9006 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9007 switch(type) {
9008 case T_BOOLEAN:
9009 case T_BYTE:
9010 kxorbl(dst, src1, src2);
9011 break;
9012 case T_CHAR:
9013 case T_SHORT:
9014 kxorwl(dst, src1, src2);
9015 break;
9016 case T_INT:
9017 case T_FLOAT:
9018 kxordl(dst, src1, src2);
9019 break;
9020 case T_LONG:
9021 case T_DOUBLE:
9022 kxorql(dst, src1, src2);
9023 break;
9024 default:
9025 fatal("Unexpected type argument %s", type2name(type));
9026 break;
9027 }
9028 }
9029
9030 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9031 switch(type) {
9032 case T_BOOLEAN:
9033 case T_BYTE:
9034 evpermb(dst, mask, nds, src, merge, vector_len); break;
9035 case T_CHAR:
9036 case T_SHORT:
9037 evpermw(dst, mask, nds, src, merge, vector_len); break;
9038 case T_INT:
9039 case T_FLOAT:
9040 evpermd(dst, mask, nds, src, merge, vector_len); break;
9041 case T_LONG:
9042 case T_DOUBLE:
9043 evpermq(dst, mask, nds, src, merge, vector_len); break;
9044 default:
9045 fatal("Unexpected type argument %s", type2name(type)); break;
9046 }
9047 }
9048
9049 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9050 switch(type) {
9051 case T_BOOLEAN:
9052 case T_BYTE:
9053 evpermb(dst, mask, nds, src, merge, vector_len); break;
9054 case T_CHAR:
9055 case T_SHORT:
9056 evpermw(dst, mask, nds, src, merge, vector_len); break;
9057 case T_INT:
9058 case T_FLOAT:
9059 evpermd(dst, mask, nds, src, merge, vector_len); break;
9060 case T_LONG:
9061 case T_DOUBLE:
9062 evpermq(dst, mask, nds, src, merge, vector_len); break;
9063 default:
9064 fatal("Unexpected type argument %s", type2name(type)); break;
9065 }
9066 }
9067
9068 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9069 switch(type) {
9070 case T_BYTE:
9071 evpminub(dst, mask, nds, src, merge, vector_len); break;
9072 case T_SHORT:
9073 evpminuw(dst, mask, nds, src, merge, vector_len); break;
9074 case T_INT:
9075 evpminud(dst, mask, nds, src, merge, vector_len); break;
9076 case T_LONG:
9077 evpminuq(dst, mask, nds, src, merge, vector_len); break;
9078 default:
9079 fatal("Unexpected type argument %s", type2name(type)); break;
9080 }
9081 }
9082
9083 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9084 switch(type) {
9085 case T_BYTE:
9086 evpmaxub(dst, mask, nds, src, merge, vector_len); break;
9087 case T_SHORT:
9088 evpmaxuw(dst, mask, nds, src, merge, vector_len); break;
9089 case T_INT:
9090 evpmaxud(dst, mask, nds, src, merge, vector_len); break;
9091 case T_LONG:
9092 evpmaxuq(dst, mask, nds, src, merge, vector_len); break;
9093 default:
9094 fatal("Unexpected type argument %s", type2name(type)); break;
9095 }
9096 }
9097
9098 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9099 switch(type) {
9100 case T_BYTE:
9101 evpminub(dst, mask, nds, src, merge, vector_len); break;
9102 case T_SHORT:
9103 evpminuw(dst, mask, nds, src, merge, vector_len); break;
9104 case T_INT:
9105 evpminud(dst, mask, nds, src, merge, vector_len); break;
9106 case T_LONG:
9107 evpminuq(dst, mask, nds, src, merge, vector_len); break;
9108 default:
9109 fatal("Unexpected type argument %s", type2name(type)); break;
9110 }
9111 }
9112
9113 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9114 switch(type) {
9115 case T_BYTE:
9116 evpmaxub(dst, mask, nds, src, merge, vector_len); break;
9117 case T_SHORT:
9118 evpmaxuw(dst, mask, nds, src, merge, vector_len); break;
9119 case T_INT:
9120 evpmaxud(dst, mask, nds, src, merge, vector_len); break;
9121 case T_LONG:
9122 evpmaxuq(dst, mask, nds, src, merge, vector_len); break;
9123 default:
9124 fatal("Unexpected type argument %s", type2name(type)); break;
9125 }
9126 }
9127
9128 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9129 switch(type) {
9130 case T_BYTE:
9131 evpminsb(dst, mask, nds, src, merge, vector_len); break;
9132 case T_SHORT:
9133 evpminsw(dst, mask, nds, src, merge, vector_len); break;
9134 case T_INT:
9135 evpminsd(dst, mask, nds, src, merge, vector_len); break;
9136 case T_LONG:
9137 evpminsq(dst, mask, nds, src, merge, vector_len); break;
9138 case T_FLOAT:
9139 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9140 case T_DOUBLE:
9141 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9142 default:
9143 fatal("Unexpected type argument %s", type2name(type)); break;
9144 }
9145 }
9146
9147 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9148 switch(type) {
9149 case T_BYTE:
9150 evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9151 case T_SHORT:
9152 evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9153 case T_INT:
9154 evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9155 case T_LONG:
9156 evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9157 case T_FLOAT:
9158 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9159 case T_DOUBLE:
9160 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9161 default:
9162 fatal("Unexpected type argument %s", type2name(type)); break;
9163 }
9164 }
9165
9166 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9167 switch(type) {
9168 case T_BYTE:
9169 evpminsb(dst, mask, nds, src, merge, vector_len); break;
9170 case T_SHORT:
9171 evpminsw(dst, mask, nds, src, merge, vector_len); break;
9172 case T_INT:
9173 evpminsd(dst, mask, nds, src, merge, vector_len); break;
9174 case T_LONG:
9175 evpminsq(dst, mask, nds, src, merge, vector_len); break;
9176 case T_FLOAT:
9177 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9178 case T_DOUBLE:
9179 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9180 default:
9181 fatal("Unexpected type argument %s", type2name(type)); break;
9182 }
9183 }
9184
9185 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9186 switch(type) {
9187 case T_BYTE:
9188 evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9189 case T_SHORT:
9190 evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9191 case T_INT:
9192 evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9193 case T_LONG:
9194 evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9195 case T_FLOAT:
9196 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9197 case T_DOUBLE:
9198 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9199 default:
9200 fatal("Unexpected type argument %s", type2name(type)); break;
9201 }
9202 }
9203
9204 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9205 switch(type) {
9206 case T_INT:
9207 evpxord(dst, mask, nds, src, merge, vector_len); break;
9208 case T_LONG:
9209 evpxorq(dst, mask, nds, src, merge, vector_len); break;
9210 default:
9211 fatal("Unexpected type argument %s", type2name(type)); break;
9212 }
9213 }
9214
9215 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9216 switch(type) {
9217 case T_INT:
9218 evpxord(dst, mask, nds, src, merge, vector_len); break;
9219 case T_LONG:
9220 evpxorq(dst, mask, nds, src, merge, vector_len); break;
9221 default:
9222 fatal("Unexpected type argument %s", type2name(type)); break;
9223 }
9224 }
9225
9226 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9227 switch(type) {
9228 case T_INT:
9229 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9230 case T_LONG:
9231 evporq(dst, mask, nds, src, merge, vector_len); break;
9232 default:
9233 fatal("Unexpected type argument %s", type2name(type)); break;
9234 }
9235 }
9236
9237 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9238 switch(type) {
9239 case T_INT:
9240 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9241 case T_LONG:
9242 evporq(dst, mask, nds, src, merge, vector_len); break;
9243 default:
9244 fatal("Unexpected type argument %s", type2name(type)); break;
9245 }
9246 }
9247
9248 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9249 switch(type) {
9250 case T_INT:
9251 evpandd(dst, mask, nds, src, merge, vector_len); break;
9252 case T_LONG:
9253 evpandq(dst, mask, nds, src, merge, vector_len); break;
9254 default:
9255 fatal("Unexpected type argument %s", type2name(type)); break;
9256 }
9257 }
9258
9259 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9260 switch(type) {
9261 case T_INT:
9262 evpandd(dst, mask, nds, src, merge, vector_len); break;
9263 case T_LONG:
9264 evpandq(dst, mask, nds, src, merge, vector_len); break;
9265 default:
9266 fatal("Unexpected type argument %s", type2name(type)); break;
9267 }
9268 }
9269
9270 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
9271 switch(masklen) {
9272 case 8:
9273 kortestbl(src1, src2);
9274 break;
9275 case 16:
9276 kortestwl(src1, src2);
9277 break;
9278 case 32:
9279 kortestdl(src1, src2);
9280 break;
9281 case 64:
9282 kortestql(src1, src2);
9283 break;
9284 default:
9285 fatal("Unexpected mask length %d", masklen);
9286 break;
9287 }
9288 }
9289
9290
9291 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
9292 switch(masklen) {
9293 case 8:
9294 ktestbl(src1, src2);
9295 break;
9296 case 16:
9297 ktestwl(src1, src2);
9298 break;
9299 case 32:
9300 ktestdl(src1, src2);
9301 break;
9302 case 64:
9303 ktestql(src1, src2);
9304 break;
9305 default:
9306 fatal("Unexpected mask length %d", masklen);
9307 break;
9308 }
9309 }
9310
9311 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9312 switch(type) {
9313 case T_INT:
9314 evprold(dst, mask, src, shift, merge, vlen_enc); break;
9315 case T_LONG:
9316 evprolq(dst, mask, src, shift, merge, vlen_enc); break;
9317 default:
9318 fatal("Unexpected type argument %s", type2name(type)); break;
9319 break;
9320 }
9321 }
9322
9323 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9324 switch(type) {
9325 case T_INT:
9326 evprord(dst, mask, src, shift, merge, vlen_enc); break;
9327 case T_LONG:
9328 evprorq(dst, mask, src, shift, merge, vlen_enc); break;
9329 default:
9330 fatal("Unexpected type argument %s", type2name(type)); break;
9331 }
9332 }
9333
9334 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9335 switch(type) {
9336 case T_INT:
9337 evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
9338 case T_LONG:
9339 evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
9340 default:
9341 fatal("Unexpected type argument %s", type2name(type)); break;
9342 }
9343 }
9344
9345 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9346 switch(type) {
9347 case T_INT:
9348 evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
9349 case T_LONG:
9350 evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
9351 default:
9352 fatal("Unexpected type argument %s", type2name(type)); break;
9353 }
9354 }
9355
9356 void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9357 assert(rscratch != noreg || always_reachable(src), "missing");
9358
9359 if (reachable(src)) {
9360 evpandq(dst, nds, as_Address(src), vector_len);
9361 } else {
9362 lea(rscratch, src);
9363 evpandq(dst, nds, Address(rscratch, 0), vector_len);
9364 }
9365 }
9366
9367 void MacroAssembler::evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
9368 assert(rscratch != noreg || always_reachable(src), "missing");
9369
9370 if (reachable(src)) {
9371 Assembler::evpaddq(dst, mask, nds, as_Address(src), merge, vector_len);
9372 } else {
9373 lea(rscratch, src);
9374 Assembler::evpaddq(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
9375 }
9376 }
9377
9378 void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9379 assert(rscratch != noreg || always_reachable(src), "missing");
9380
9381 if (reachable(src)) {
9382 evporq(dst, nds, as_Address(src), vector_len);
9383 } else {
9384 lea(rscratch, src);
9385 evporq(dst, nds, Address(rscratch, 0), vector_len);
9386 }
9387 }
9388
9389 void MacroAssembler::vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9390 assert(rscratch != noreg || always_reachable(src), "missing");
9391
9392 if (reachable(src)) {
9393 vpshufb(dst, nds, as_Address(src), vector_len);
9394 } else {
9395 lea(rscratch, src);
9396 vpshufb(dst, nds, Address(rscratch, 0), vector_len);
9397 }
9398 }
9399
9400 void MacroAssembler::vpor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9401 assert(rscratch != noreg || always_reachable(src), "missing");
9402
9403 if (reachable(src)) {
9404 Assembler::vpor(dst, nds, as_Address(src), vector_len);
9405 } else {
9406 lea(rscratch, src);
9407 Assembler::vpor(dst, nds, Address(rscratch, 0), vector_len);
9408 }
9409 }
9410
9411 void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) {
9412 assert(rscratch != noreg || always_reachable(src3), "missing");
9413
9414 if (reachable(src3)) {
9415 vpternlogq(dst, imm8, src2, as_Address(src3), vector_len);
9416 } else {
9417 lea(rscratch, src3);
9418 vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len);
9419 }
9420 }
9421
9422 #if COMPILER2_OR_JVMCI
9423
// Predicated vector store of the low 'length' elements of xmm to dst.
// Builds a mask register with the low 'length' bits set and emits a
// merge-masked vector store. Clobbers temp and mask.
void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
                                 Register length, Register temp, int vec_enc) {
  // Computing mask for predicated vector store.
  movptr(temp, -1);
  bzhiq(temp, temp, length);   // clear all bits of the all-ones value at index >= length
  kmov(mask, temp);
  evmovdqu(bt, mask, dst, xmm, true, vec_enc);   // merge-masked store
}
9432
// Set memory operation for length "less than" 64 bytes.
// shift encodes the element size (0=byte, 1=short, 2=int, 3=long); 'length'
// is in elements. Without 64-byte vectors: store a full 32 bytes, then
// mask-store the remainder. With them: one masked 64-byte store.
void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
                                   XMMRegister xmm, KRegister mask, Register length,
                                   Register temp, bool use64byteVector) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  if (!use64byteVector) {
    fill32(dst, disp, xmm);
    subptr(length, 32 >> shift);   // 32 bytes consumed, counted in elements
    fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
  } else {
    assert(MaxVectorSize == 64, "vector length != 64");
    fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
  }
}
9448
9449
// Masked store of up to 32 bytes ('length' elements, element size 1 << shift)
// of xmm at [dst + disp]. Clobbers temp and mask.
void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
                                   XMMRegister xmm, KRegister mask, Register length,
                                   Register temp) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
}
9457
9458
// Unmasked store of the 32-byte (YMM) contents of xmm at dst.
void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  vmovdqu(dst, xmm);
}
9463
// Unmasked 32-byte store of xmm at [dst + disp].
void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
  fill32(Address(dst, disp), xmm);
}
9467
// Store 64 bytes of xmm at dst: either two 32-byte YMM stores or a single
// 64-byte ZMM store when use64byteVector is set.
void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  if (!use64byteVector) {
    fill32(dst, xmm);
    fill32(dst.plus_disp(32), xmm);
  } else {
    evmovdquq(dst, xmm, Assembler::AVX_512bit);
  }
}
9477
// 64-byte store of xmm at [dst + disp]; see fill64(Address, ...) above.
void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
  fill64(Address(dst, disp), xmm, use64byteVector);
}
9481
9482 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
9483 Register count, Register rtmp, XMMRegister xtmp) {
9484 Label L_exit;
9485 Label L_fill_start;
9486 Label L_fill_64_bytes;
9487 Label L_fill_96_bytes;
9488 Label L_fill_128_bytes;
9489 Label L_fill_128_bytes_loop;
9490 Label L_fill_128_loop_header;
9491 Label L_fill_128_bytes_loop_header;
9492 Label L_fill_128_bytes_loop_pre_header;
9493 Label L_fill_zmm_sequence;
9494
9495 int shift = -1;
9496 switch(type) {
9497 case T_BYTE: shift = 0;
9498 break;
9499 case T_SHORT: shift = 1;
9500 break;
9501 case T_INT: shift = 2;
9502 break;
9503 /* Uncomment when LONG fill stubs are supported.
9504 case T_LONG: shift = 3;
9505 break;
9506 */
9507 default:
9508 fatal("Unhandled type: %s\n", type2name(type));
9509 }
9510
9511 if ((CopyAVX3Threshold != 0) || (MaxVectorSize == 32)) {
9512
9513 if (MaxVectorSize == 64) {
9514 cmpq(count, CopyAVX3Threshold >> shift);
9515 jcc(Assembler::greater, L_fill_zmm_sequence);
9516 }
9517
9518 evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
9519
9520 bind(L_fill_start);
9521
9522 cmpq(count, 32 >> shift);
9523 jccb(Assembler::greater, L_fill_64_bytes);
9524 fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
9525 jmp(L_exit);
9526
9527 bind(L_fill_64_bytes);
9528 cmpq(count, 64 >> shift);
9529 jccb(Assembler::greater, L_fill_96_bytes);
9530 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
9531 jmp(L_exit);
9532
9533 bind(L_fill_96_bytes);
9534 cmpq(count, 96 >> shift);
9535 jccb(Assembler::greater, L_fill_128_bytes);
9536 fill64(to, 0, xtmp);
9537 subq(count, 64 >> shift);
9538 fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
9539 jmp(L_exit);
9540
9541 bind(L_fill_128_bytes);
9542 cmpq(count, 128 >> shift);
9543 jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
9544 fill64(to, 0, xtmp);
9545 fill32(to, 64, xtmp);
9546 subq(count, 96 >> shift);
9547 fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
9548 jmp(L_exit);
9549
9550 bind(L_fill_128_bytes_loop_pre_header);
9551 {
9552 mov(rtmp, to);
9553 andq(rtmp, 31);
9554 jccb(Assembler::zero, L_fill_128_bytes_loop_header);
9555 negq(rtmp);
9556 addq(rtmp, 32);
9557 mov64(r8, -1L);
9558 bzhiq(r8, r8, rtmp);
9559 kmovql(k2, r8);
9560 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit);
9561 addq(to, rtmp);
9562 shrq(rtmp, shift);
9563 subq(count, rtmp);
9564 }
9565
9566 cmpq(count, 128 >> shift);
9567 jcc(Assembler::less, L_fill_start);
9568
9569 bind(L_fill_128_bytes_loop_header);
9570 subq(count, 128 >> shift);
9571
9572 align32();
9573 bind(L_fill_128_bytes_loop);
9574 fill64(to, 0, xtmp);
9575 fill64(to, 64, xtmp);
9576 addq(to, 128);
9577 subq(count, 128 >> shift);
9578 jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
9579
9580 addq(count, 128 >> shift);
9581 jcc(Assembler::zero, L_exit);
9582 jmp(L_fill_start);
9583 }
9584
9585 if (MaxVectorSize == 64) {
9586 // Sequence using 64 byte ZMM register.
9587 Label L_fill_128_bytes_zmm;
9588 Label L_fill_192_bytes_zmm;
9589 Label L_fill_192_bytes_loop_zmm;
9590 Label L_fill_192_bytes_loop_header_zmm;
9591 Label L_fill_192_bytes_loop_pre_header_zmm;
9592 Label L_fill_start_zmm_sequence;
9593
9594 bind(L_fill_zmm_sequence);
9595 evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
9596
9597 bind(L_fill_start_zmm_sequence);
9598 cmpq(count, 64 >> shift);
9599 jccb(Assembler::greater, L_fill_128_bytes_zmm);
9600 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
9601 jmp(L_exit);
9602
9603 bind(L_fill_128_bytes_zmm);
9604 cmpq(count, 128 >> shift);
9605 jccb(Assembler::greater, L_fill_192_bytes_zmm);
9606 fill64(to, 0, xtmp, true);
9607 subq(count, 64 >> shift);
9608 fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
9609 jmp(L_exit);
9610
9611 bind(L_fill_192_bytes_zmm);
9612 cmpq(count, 192 >> shift);
9613 jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
9614 fill64(to, 0, xtmp, true);
9615 fill64(to, 64, xtmp, true);
9616 subq(count, 128 >> shift);
9617 fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
9618 jmp(L_exit);
9619
9620 bind(L_fill_192_bytes_loop_pre_header_zmm);
9621 {
9622 movq(rtmp, to);
9623 andq(rtmp, 63);
9624 jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
9625 negq(rtmp);
9626 addq(rtmp, 64);
9627 mov64(r8, -1L);
9628 bzhiq(r8, r8, rtmp);
9629 kmovql(k2, r8);
9630 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit);
9631 addq(to, rtmp);
9632 shrq(rtmp, shift);
9633 subq(count, rtmp);
9634 }
9635
9636 cmpq(count, 192 >> shift);
9637 jcc(Assembler::less, L_fill_start_zmm_sequence);
9638
9639 bind(L_fill_192_bytes_loop_header_zmm);
9640 subq(count, 192 >> shift);
9641
9642 align32();
9643 bind(L_fill_192_bytes_loop_zmm);
9644 fill64(to, 0, xtmp, true);
9645 fill64(to, 64, xtmp, true);
9646 fill64(to, 128, xtmp, true);
9647 addq(to, 192);
9648 subq(count, 192 >> shift);
9649 jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
9650
9651 addq(count, 192 >> shift);
9652 jcc(Assembler::zero, L_exit);
9653 jmp(L_fill_start_zmm_sequence);
9654 }
9655 bind(L_exit);
9656 }
9657 #endif //COMPILER2_OR_JVMCI
9658
9659
// Convert the float in src to an int in dst with Java (JLS) semantics.
// cvttss2sil already matches the JLS except for overflow, underflow and NaN,
// which all produce 0x80000000; in that case the f2i fixup stub recomputes
// the result (input passed in a stack slot, result popped from it).
void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
  Label done;
  cvttss2sil(dst, src);
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
  cmpl(dst, 0x80000000); // float_sign_flip
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movflt(Address(rsp, 0), src);    // pass the float to the stub on the stack
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
  pop(dst);                        // fixed-up result comes back in the slot
  bind(done);
}
9672
// Convert the double in src to an int in dst with Java (JLS) semantics.
// cvttsd2sil already matches the JLS except for overflow, underflow and NaN,
// which all produce 0x80000000; in that case the d2i fixup stub recomputes
// the result (input passed in a stack slot, result popped from it).
void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
  Label done;
  cvttsd2sil(dst, src);
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
  cmpl(dst, 0x80000000); // float_sign_flip
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);    // pass the double to the stub on the stack
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
  pop(dst);                        // fixed-up result comes back in the slot
  bind(done);
}
9685
// Convert the float in src to a long in dst with Java (JLS) semantics.
// cvttss2siq produces 0x8000000000000000 (the 64-bit value at
// double_sign_flip) for overflow, underflow and NaN; in that case the f2l
// fixup stub recomputes the result (input passed in a stack slot, result
// popped from it).
void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
  Label done;
  cvttss2siq(dst, src);
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movflt(Address(rsp, 0), src);    // pass the float to the stub on the stack
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
  pop(dst);                        // fixed-up result comes back in the slot
  bind(done);
}
9697
// Rounds the float in src to the nearest int in dst.
// Following code is line by line assembly translation rounding algorithm.
// Please refer to java.lang.Math.round(float) algorithm for details.
// Fast path: reconstruct the significand (with its implicit leading one),
// shift it right so that one fractional bit remains, then add 1 and shift
// once more — i.e. round half up. Inputs whose required shift falls outside
// [0, 31] (NaN, infinities, very large/small magnitudes) take L_special_case,
// which defers to convert_f2i for exact JLS float->int semantics.
// Clobbers rtmp; rcx receives the variable shift count (consumed via cl).
void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
  const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
  const int32_t FloatConsts_EXP_BIAS = 127;
  const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
  const int32_t MINUS_32 = 0xFFFFFFE0;
  Label L_special_case, L_block1, L_exit;
  // Extract the biased exponent of src into dst.
  movl(rtmp, FloatConsts_EXP_BIT_MASK);
  movdl(dst, src);
  andl(dst, rtmp);
  sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
  // shift = (SIGNIFICAND_WIDTH - 2 + EXP_BIAS) - biasedExponent
  movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
  subl(rtmp, dst);
  movl(rcx, rtmp);                 // shift count for the variable sarl below
  // Any bit outside [0, 31] set in the shift count -> slow path.
  movl(dst, MINUS_32);
  testl(rtmp, dst);
  jccb(Assembler::notEqual, L_special_case);
  // Rebuild the significand including the implicit leading one bit.
  movdl(dst, src);
  andl(dst, FloatConsts_SIGNIF_BIT_MASK);
  orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
  // Negate the significand for negative inputs.
  movdl(rtmp, src);
  testl(rtmp, rtmp);
  jccb(Assembler::greaterEqual, L_block1);
  negl(dst);
  bind(L_block1);
  // ((significand >> shift) + 1) >> 1 : round half up.
  sarl(dst);                       // arithmetic shift right by cl
  addl(dst, 0x1);
  sarl(dst, 0x1);
  jmp(L_exit);
  bind(L_special_case);
  convert_f2i(dst, src);           // JLS-conformant slow path
  bind(L_exit);
}
9733
// Rounds the double in src to the nearest long in dst.
// Following code is line by line assembly translation rounding algorithm.
// Please refer to java.lang.Math.round(double) algorithm for details.
// Fast path: reconstruct the significand (with its implicit leading one),
// shift it right so that one fractional bit remains, then add 1 and shift
// once more — i.e. round half up. Inputs whose required shift falls outside
// [0, 63] (NaN, infinities, very large/small magnitudes) take L_special_case,
// which defers to convert_d2l for exact JLS double->long semantics.
// Clobbers rtmp; rcx receives the variable shift count (consumed via cl).
void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
  const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
  const int64_t DoubleConsts_EXP_BIAS = 1023;
  const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
  const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
  Label L_special_case, L_block1, L_exit;
  // Extract the biased exponent of src into dst.
  mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
  movq(dst, src);
  andq(dst, rtmp);
  sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
  // shift = (SIGNIFICAND_WIDTH - 2 + EXP_BIAS) - biasedExponent
  mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
  subq(rtmp, dst);
  movq(rcx, rtmp);                 // shift count for the variable sarq below
  // Any bit outside [0, 63] set in the shift count -> slow path.
  mov64(dst, MINUS_64);
  testq(rtmp, dst);
  jccb(Assembler::notEqual, L_special_case);
  // Rebuild the significand including the implicit leading one bit.
  movq(dst, src);
  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
  andq(dst, rtmp);
  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
  orq(dst, rtmp);
  // Negate the significand for negative inputs.
  movq(rtmp, src);
  testq(rtmp, rtmp);
  jccb(Assembler::greaterEqual, L_block1);
  negq(dst);
  bind(L_block1);
  // ((significand >> shift) + 1) >> 1 : round half up.
  sarq(dst);                       // arithmetic shift right by cl
  addq(dst, 0x1);
  sarq(dst, 0x1);
  jmp(L_exit);
  bind(L_special_case);
  convert_d2l(dst, src);           // JLS-conformant slow path
  bind(L_exit);
}
9771
// Convert the double in src to a long in dst with Java (JLS) semantics.
// cvttsd2siq produces 0x8000000000000000 (the 64-bit value at
// double_sign_flip) for overflow, underflow and NaN; in that case the d2l
// fixup stub recomputes the result (input passed in a stack slot, result
// popped from it).
void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
  Label done;
  cvttsd2siq(dst, src);
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);    // pass the double to the stub on the stack
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
  pop(dst);                        // fixed-up result comes back in the slot
  bind(done);
}
9783
// Write back the cache line containing 'line' to memory, using the best
// available instruction on this CPU.
void MacroAssembler::cache_wb(Address line)
{
  // 64 bit cpus always support clflush
  assert(VM_Version::supports_clflush(), "clflush should be available");
  bool optimized = VM_Version::supports_clflushopt();
  bool no_evict = VM_Version::supports_clwb();

  // prefer clwb (writeback without evict) otherwise
  // prefer clflushopt (potentially parallel writeback with evict)
  // otherwise fallback on clflush (serial writeback with evict)

  if (optimized) {
    if (no_evict) {
      clwb(line);
    } else {
      clflushopt(line);
    }
  } else {
    // no need for fence when using CLFLUSH
    clflush(line);
  }
}
9806
// Emit any ordering required around a sequence of cache_wb calls.
// Only the post-sync (is_pre == false) needs a fence, and only when the
// weakly-ordered clflushopt/clwb instructions may have been used.
void MacroAssembler::cache_wbsync(bool is_pre)
{
  assert(VM_Version::supports_clflush(), "clflush should be available");
  bool optimized = VM_Version::supports_clflushopt();
  bool no_evict = VM_Version::supports_clwb();

  // pick the correct implementation

  if (!is_pre && (optimized || no_evict)) {
    // need an sfence for post flush when using clflushopt or clwb
    // otherwise no need for any synchronization

    sfence();
  }
}
9822
9823 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9824 switch (cond) {
9825 // Note some conditions are synonyms for others
9826 case Assembler::zero: return Assembler::notZero;
9827 case Assembler::notZero: return Assembler::zero;
9828 case Assembler::less: return Assembler::greaterEqual;
9829 case Assembler::lessEqual: return Assembler::greater;
9830 case Assembler::greater: return Assembler::lessEqual;
9831 case Assembler::greaterEqual: return Assembler::less;
9832 case Assembler::below: return Assembler::aboveEqual;
9833 case Assembler::belowEqual: return Assembler::above;
9834 case Assembler::above: return Assembler::belowEqual;
9835 case Assembler::aboveEqual: return Assembler::below;
9836 case Assembler::overflow: return Assembler::noOverflow;
9837 case Assembler::noOverflow: return Assembler::overflow;
9838 case Assembler::negative: return Assembler::positive;
9839 case Assembler::positive: return Assembler::negative;
9840 case Assembler::parity: return Assembler::noParity;
9841 case Assembler::noParity: return Assembler::parity;
9842 }
9843 ShouldNotReachHere(); return Assembler::overflow;
9844 }
9845
// This is simply a call to Thread::current()
// All potentially clobbered caller-saved GPRs are saved and restored around
// the C call; the result (rax) is copied into 'thread', and rax itself is
// preserved unless it is the destination register.
void MacroAssembler::get_thread_slow(Register thread) {
  if (thread != rax) {
    push(rax);     // preserve rax, since the call returns in it
  }
  push(rdi);
  push(rsi);
  push(rdx);
  push(rcx);
  push(r8);
  push(r9);
  push(r10);
  push(r11);

  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);

  // Restore in exact reverse order of the pushes above.
  pop(r11);
  pop(r10);
  pop(r9);
  pop(r8);
  pop(rcx);
  pop(rdx);
  pop(rsi);
  pop(rdi);
  if (thread != rax) {
    mov(thread, rax);
    pop(rax);
  }
}
9875
// Debug helper: verify that (sp + bias) is 2*wordSize aligned; if not, emit
// a stop() with 'msg'. tmp is only clobbered when bias != 0.
void MacroAssembler::check_stack_alignment(Register sp, const char* msg, unsigned bias, Register tmp) {
  Label L_stack_ok;
  if (bias == 0) {
    testptr(sp, 2 * wordSize - 1);   // low alignment bits must be zero
  } else {
    // lea(tmp, Address(rsp, bias);
    mov(tmp, sp);
    addptr(tmp, bias);
    testptr(tmp, 2 * wordSize - 1);
  }
  jcc(Assembler::equal, L_stack_ok);
  block_comment(msg);
  stop(msg);
  bind(L_stack_ok);
}
9891
// Implements fast-locking.
//
// obj: the object to be locked
// reg_rax: rax
// thread: the thread which attempts to lock obj
// tmp: a temporary register
// basic_lock: only used to clear the per-BasicLock ObjectMonitor cache slot
//             when UseObjectMonitorTable is enabled
// On any failure control transfers to 'slow'; on success execution falls
// through with obj pushed onto the thread's lock-stack.
void MacroAssembler::fast_lock(Register basic_lock, Register obj, Register reg_rax, Register tmp, Label& slow) {
  Register thread = r15_thread;

  assert(reg_rax == rax, "");
  assert_different_registers(basic_lock, obj, reg_rax, thread, tmp);

  Label push;
  const Register top = tmp;

  // Preload the markWord. It is important that this is the first
  // instruction emitted as it is part of C1's null check semantics.
  movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Take the slow path for (diagnosable) synchronization on value-based classes.
    load_klass(tmp, obj, rscratch1);
    testb(Address(tmp, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow);
  }

  // Load top.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

  // Check if the lock-stack is full.
  cmpl(top, LockStack::end_offset());
  jcc(Assembler::greaterEqual, slow);

  // Check for recursion.
  cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
  jcc(Assembler::equal, push);

  // Check header for monitor (0b10).
  testptr(reg_rax, markWord::monitor_value);
  jcc(Assembler::notZero, slow);

  // Try to lock. Transition lock bits 0b01 => 0b00
  movptr(tmp, reg_rax);                           // new value: mark with lock bits cleared
  andptr(tmp, ~(int32_t)markWord::unlocked_value);
  orptr(reg_rax, markWord::unlocked_value);       // expected value: mark with unlocked bit set
  lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
  jcc(Assembler::notEqual, slow);

  // Restore top, CAS clobbers register.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

  bind(push);
  // After successful lock, push object on lock-stack.
  movptr(Address(thread, top), obj);
  incrementl(top, oopSize);
  movl(Address(thread, JavaThread::lock_stack_top_offset()), top);
}
9953
// Implements fast-unlocking.
//
// obj: the object to be unlocked
// reg_rax: rax
// thread: the thread
// tmp: a temporary register
// On any failure the lock-stack is restored and control transfers to 'slow';
// on success execution falls through at 'unlocked' with obj popped from the
// thread's lock-stack.
void MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register tmp, Label& slow) {
  Register thread = r15_thread;

  assert(reg_rax == rax, "");
  assert_different_registers(obj, reg_rax, thread, tmp);

  Label unlocked, push_and_slow;
  const Register top = tmp;

  // Check if obj is top of lock-stack.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
  cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
  jcc(Assembler::notEqual, slow);

  // Pop lock-stack.
  DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
  subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

  // Check if recursive. (If obj also occupies the next-lower slot, this was
  // a recursive lock and no mark-word transition is needed.)
  cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
  jcc(Assembler::equal, unlocked);

  // Not recursive. Check header for monitor (0b10).
  movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
  testptr(reg_rax, markWord::monitor_value);
  jcc(Assembler::notZero, push_and_slow);

#ifdef ASSERT
  // Check header not unlocked (0b01).
  Label not_unlocked;
  testptr(reg_rax, markWord::unlocked_value);
  jcc(Assembler::zero, not_unlocked);
  stop("fast_unlock already unlocked");
  bind(not_unlocked);
#endif

  // Try to unlock. Transition lock bits 0b00 => 0b01
  movptr(tmp, reg_rax);
  orptr(tmp, markWord::unlocked_value);           // new value: mark with unlocked bit set
  lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
  jcc(Assembler::equal, unlocked);

  bind(push_and_slow);
  // Restore lock-stack and handle the unlock in runtime.
#ifdef ASSERT
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
  movptr(Address(thread, top), obj);              // undo the debug-only clear above
#endif
  addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
  jmp(slow);

  bind(unlocked);
}
10013
// Saves legacy GPRs state on stack.
// 16 slots are reserved and filled in x86 register-encoding order
// (rax=slot 15 down to r15=slot 0); slot 11*wordSize — rsp's position in
// that order — is intentionally left unwritten. restore_legacy_gprs skips
// the same slot.
void MacroAssembler::save_legacy_gprs() {
  subq(rsp, 16 * wordSize);
  movq(Address(rsp, 15 * wordSize), rax);
  movq(Address(rsp, 14 * wordSize), rcx);
  movq(Address(rsp, 13 * wordSize), rdx);
  movq(Address(rsp, 12 * wordSize), rbx);
  movq(Address(rsp, 10 * wordSize), rbp);
  movq(Address(rsp, 9 * wordSize), rsi);
  movq(Address(rsp, 8 * wordSize), rdi);
  movq(Address(rsp, 7 * wordSize), r8);
  movq(Address(rsp, 6 * wordSize), r9);
  movq(Address(rsp, 5 * wordSize), r10);
  movq(Address(rsp, 4 * wordSize), r11);
  movq(Address(rsp, 3 * wordSize), r12);
  movq(Address(rsp, 2 * wordSize), r13);
  movq(Address(rsp, wordSize), r14);
  movq(Address(rsp, 0), r15);
}
10033
// Restores legacy GPRs state from the stack layout written by
// save_legacy_gprs (slot 11*wordSize is unused there as well) and releases
// the 16 reserved slots.
void MacroAssembler::restore_legacy_gprs() {
  movq(r15, Address(rsp, 0));
  movq(r14, Address(rsp, wordSize));
  movq(r13, Address(rsp, 2 * wordSize));
  movq(r12, Address(rsp, 3 * wordSize));
  movq(r11, Address(rsp, 4 * wordSize));
  movq(r10, Address(rsp, 5 * wordSize));
  movq(r9, Address(rsp, 6 * wordSize));
  movq(r8, Address(rsp, 7 * wordSize));
  movq(rdi, Address(rsp, 8 * wordSize));
  movq(rsi, Address(rsp, 9 * wordSize));
  movq(rbp, Address(rsp, 10 * wordSize));
  movq(rbx, Address(rsp, 12 * wordSize));
  movq(rdx, Address(rsp, 13 * wordSize));
  movq(rcx, Address(rsp, 14 * wordSize));
  movq(rax, Address(rsp, 15 * wordSize));
  addq(rsp, 16 * wordSize);
}
10053
// Load the address of an AOT runtime-constants field into reg.
// When dumping the AOT code cache the address must be relocatable, so it is
// emitted as an ExternalAddress (registered in the cache's address table);
// otherwise a plain 64-bit immediate suffices. Only valid with CDS built in.
void MacroAssembler::load_aotrc_address(Register reg, address a) {
#if INCLUDE_CDS
  assert(AOTRuntimeConstants::contains(a), "address out of range for data area");
  if (AOTCodeCache::is_on_for_dump()) {
    // all aotrc field addresses should be registered in the AOTCodeCache address table
    lea(reg, ExternalAddress(a));
  } else {
    mov64(reg, (uint64_t)a);
  }
#else
  ShouldNotReachHere();
#endif
}
10067
10068 void MacroAssembler::setcc(Assembler::Condition comparison, Register dst) {
10069 if (VM_Version::supports_apx_f()) {
10070 esetzucc(comparison, dst);
10071 } else {
10072 setb(comparison, dst);
10073 movzbl(dst, dst);
10074 }
10075 }