1 /*
2 * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "code/aotCodeCache.hpp"
28 #include "code/compiledIC.hpp"
29 #include "compiler/compiler_globals.hpp"
30 #include "compiler/disassembler.hpp"
31 #include "ci/ciInlineKlass.hpp"
32 #include "crc32c.h"
33 #include "gc/shared/barrierSet.hpp"
34 #include "gc/shared/barrierSetAssembler.hpp"
35 #include "gc/shared/collectedHeap.inline.hpp"
36 #include "gc/shared/tlab_globals.hpp"
37 #include "interpreter/bytecodeHistogram.hpp"
38 #include "interpreter/interpreter.hpp"
39 #include "interpreter/interpreterRuntime.hpp"
40 #include "jvm.h"
41 #include "memory/resourceArea.hpp"
42 #include "memory/universe.hpp"
43 #include "oops/accessDecorators.hpp"
44 #include "oops/compressedKlass.inline.hpp"
45 #include "oops/compressedOops.inline.hpp"
46 #include "oops/klass.inline.hpp"
47 #include "oops/resolvedFieldEntry.hpp"
48 #include "prims/methodHandles.hpp"
49 #include "runtime/arguments.hpp"
50 #include "runtime/continuation.hpp"
51 #include "runtime/interfaceSupport.inline.hpp"
52 #include "runtime/javaThread.hpp"
53 #include "runtime/jniHandles.hpp"
54 #include "runtime/objectMonitor.hpp"
55 #include "runtime/os.hpp"
56 #include "runtime/safepoint.hpp"
57 #include "runtime/safepointMechanism.hpp"
58 #include "runtime/sharedRuntime.hpp"
59 #include "runtime/signature_cc.hpp"
60 #include "runtime/stubRoutines.hpp"
61 #include "utilities/checkedCast.hpp"
62 #include "utilities/macros.hpp"
63 #include "vmreg_x86.inline.hpp"
64 #ifdef COMPILER2
65 #include "opto/output.hpp"
66 #endif
67
68 #ifdef PRODUCT
69 #define BLOCK_COMMENT(str) /* nothing */
70 #define STOP(error) stop(error)
71 #else
72 #define BLOCK_COMMENT(str) block_comment(str)
73 #define STOP(error) block_comment(error); stop(error)
74 #endif
75
76 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
77
#ifdef ASSERT
// Platform hook: on x86 the assembler's instruction-mark consistency checks are enabled.
bool AbstractAssembler::pd_check_instruction_mark() { return true; }
#endif
81
// Negation table for x86 condition codes, indexed by the
// Assembler::Condition encoding (0x0 .. 0xf): reverse[cc] is the
// condition that tests the opposite outcome of cc.
static const Assembler::Condition reverse[] = {
    Assembler::noOverflow /* overflow = 0x0 */ ,
    Assembler::overflow /* noOverflow = 0x1 */ ,
    Assembler::aboveEqual /* carrySet = 0x2, below = 0x2 */ ,
    Assembler::below /* aboveEqual = 0x3, carryClear = 0x3 */ ,
    Assembler::notZero /* zero = 0x4, equal = 0x4 */ ,
    Assembler::zero /* notZero = 0x5, notEqual = 0x5 */ ,
    Assembler::above /* belowEqual = 0x6 */ ,
    Assembler::belowEqual /* above = 0x7 */ ,
    Assembler::positive /* negative = 0x8 */ ,
    Assembler::negative /* positive = 0x9 */ ,
    Assembler::noParity /* parity = 0xa */ ,
    Assembler::parity /* noParity = 0xb */ ,
    Assembler::greaterEqual /* less = 0xc */ ,
    Assembler::less /* greaterEqual = 0xd */ ,
    Assembler::greater /* lessEqual = 0xe */ ,
    Assembler::lessEqual /* greater = 0xf, */

};
101
102
103 // Implementation of MacroAssembler
104
// Convert a reachable, non-lval AddressLiteral into a pc-relative Address
// for the instruction currently being emitted.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address(checked_cast<int32_t>(adr.target() - pc()), adr.target(), adr.reloc());

}
114
// Materialize the base of an ArrayAddress into rscratch and return an
// Address that applies the original index/scale against that register.
// Clobbers rscratch.
Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
  AddressLiteral base = adr.base();
  lea(rscratch, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch, index._index, index._scale, index._disp);
  return array;
}
123
// Emit a call to a leaf VM runtime routine (arguments are assumed to be in
// registers already). Ensures rsp is 16-byte aligned at the call site as
// required by the ABI, re-aligning with an 8-byte adjustment if needed.
void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for its register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp, frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  // rsp is 8 mod 16: push 8 filler bytes, call, then undo.
  subq(rsp, 8);
  call(RuntimeAddress(entry_point));
  addq(rsp, 8);
  jmp(E);

  bind(L);
  call(RuntimeAddress(entry_point));

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif
}
152
// 64-bit compare of src1 against the value stored at a literal address.
// Falls back to loading the address into rscratch when it is not
// RIP-reachable. Clobbers rscratch in that case.
void MacroAssembler::cmp64(Register src1, AddressLiteral src2, Register rscratch) {
  assert(!src2.is_lval(), "should use cmpptr");
  assert(rscratch != noreg || always_reachable(src2), "missing");

  if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch, src2);
    Assembler::cmpq(src1, Address(rscratch, 0));
  }
}
164
int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271. The function
  // returns the (pc) offset of the idivl instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor (may not be eax/edx)     -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case: dividend == min_long (rdx is free to clobber
  // here since cdqq below overwrites it anyway)
  cmp64(rax, ExternalAddress((address) &min_long), rdx /*rscratch*/);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case: sign-extend rax into rdx:rax, then divide
  bind(normal_case);
  cdqq();
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}
201
202 void MacroAssembler::decrementq(Register reg, int value) {
203 if (value == min_jint) { subq(reg, value); return; }
204 if (value < 0) { incrementq(reg, -value); return; }
205 if (value == 0) { ; return; }
206 if (value == 1 && UseIncDec) { decq(reg) ; return; }
207 /* else */ { subq(reg, value) ; return; }
208 }
209
210 void MacroAssembler::decrementq(Address dst, int value) {
211 if (value == min_jint) { subq(dst, value); return; }
212 if (value < 0) { incrementq(dst, -value); return; }
213 if (value == 0) { ; return; }
214 if (value == 1 && UseIncDec) { decq(dst) ; return; }
215 /* else */ { subq(dst, value) ; return; }
216 }
217
// Increment the 64-bit word at a literal address; loads the address into
// rscratch first when it is not RIP-reachable.
void MacroAssembler::incrementq(AddressLiteral dst, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");

  if (reachable(dst)) {
    incrementq(as_Address(dst));
  } else {
    lea(rscratch, dst);
    incrementq(Address(rscratch, 0));
  }
}
228
229 void MacroAssembler::incrementq(Register reg, int value) {
230 if (value == min_jint) { addq(reg, value); return; }
231 if (value < 0) { decrementq(reg, -value); return; }
232 if (value == 0) { ; return; }
233 if (value == 1 && UseIncDec) { incq(reg) ; return; }
234 /* else */ { addq(reg, value) ; return; }
235 }
236
237 void MacroAssembler::incrementq(Address dst, int value) {
238 if (value == min_jint) { addq(dst, value); return; }
239 if (value < 0) { decrementq(dst, -value); return; }
240 if (value == 0) { ; return; }
241 if (value == 1 && UseIncDec) { incq(dst) ; return; }
242 /* else */ { addq(dst, value) ; return; }
243 }
244
// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
// Indirect jump through a jump table: materialize the table base into
// rscratch, then jump through the indexed entry. Clobbers rscratch.
void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
  lea(rscratch, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch;
  jmp(dispatch);
}
254
// 32-bit-only long compare; must never be reached on 64-bit where a long
// lives in a single register.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}
259
// Load the literal's target address (with its relocation) into dst.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}

// Store the literal's target address to a memory location, staging it
// through rscratch. Clobbers rscratch.
void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
  lea(rscratch, adr);
  movptr(dst, rscratch);
}
268
// Emit the one-byte LEAVE instruction (mov rsp, rbp; pop rbp).
void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_int8((unsigned char)0xC9); // LEAVE
}
273
// 32-bit-only long negate; must never be reached on 64-bit.
void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}
278
// Load an oop constant into dst with an immediate-oop relocation so the GC
// can find and update it.
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}

// Store an oop constant to memory, staged through rscratch. Clobbers rscratch.
void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
  mov_literal64(rscratch, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch);
}

// Load a Metadata* constant into dst with an immediate-metadata relocation.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
}

// Store a Metadata* constant to memory, staged through rscratch. Clobbers rscratch.
void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
  mov_literal64(rscratch, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  movq(dst, rscratch);
}
296
// Load from (or the address of) an AddressLiteral: an lval loads the
// address itself; an rval loads the pointed-to value, indirecting through
// dst when the target is not RIP-reachable.
void MacroAssembler::movptr(Register dst, AddressLiteral src) {
  if (src.is_lval()) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  } else {
    if (reachable(src)) {
      movq(dst, as_Address(src));
    } else {
      // Use dst itself as the scratch register for the address.
      lea(dst, src);
      movq(dst, Address(dst, 0));
    }
  }
}

// Store src to an array slot; rscratch holds the array base. Clobbers rscratch.
void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
  movq(as_Address(dst, rscratch), src);
}

// Load an array slot into dst, using dst itself as the scratch base register.
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src, dst /*rscratch*/));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
// Store an integral constant: directly when it fits a sign-extended imm32,
// otherwise staged through rscratch. Clobbers rscratch in the latter case.
void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
  if (is_simm32(src)) {
    movptr(dst, checked_cast<int32_t>(src));
  } else {
    mov64(rscratch, src);
    movq(dst, rscratch);
  }
}
327
// Push an oop constant on the stack, staged through rscratch. Clobbers rscratch.
void MacroAssembler::pushoop(jobject obj, Register rscratch) {
  movoop(rscratch, obj);
  push(rscratch);
}

// Push a Metadata* constant on the stack, staged through rscratch. Clobbers rscratch.
void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
  mov_metadata(rscratch, obj);
  push(rscratch);
}

// Push either the address of the literal (lval) or the value it points to
// (rval). Clobbers rscratch.
void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
  lea(rscratch, src);
  if (src.is_lval()) {
    push(rscratch);
  } else {
    pushq(Address(rscratch, 0));
  }
}
346
// Helpers to move a value into the n-th C calling-convention argument
// register, skipping the move when it is already there.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}
370
// Emit code that halts the VM with the given message by calling debug64.
// When ShowMessageBoxOnError is set, also passes the faulting rip (c_rarg1)
// and a pointer to the pusha'd register array (c_rarg2) to debug64.
void MacroAssembler::stop(const char* msg) {
  if (ShowMessageBoxOnError) {
    address rip = pc();
    pusha(); // get regs on stack
    lea(c_rarg1, InternalAddress(rip));
    movq(c_rarg2, rsp); // pass pointer to regs array
  }
  // Skip AOT caching C strings in scratch buffer.
  const char* str = (code_section()->scratch_emit()) ? msg : AOTCodeCache::add_C_string(msg);
  lea(c_rarg0, ExternalAddress((address) str));
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}
385
// Emit code that prints a warning message via the runtime 'warning'
// function and continues execution. Builds a temporary rbp frame so the
// original rsp can be restored after the aligned call.
void MacroAssembler::warn(const char* msg) {
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16); // align stack as required by push_CPU_state and call
  push_CPU_state(); // keeps alignment at 16 bytes

#ifdef _WIN64
  // Windows always allocates space for its register args
  subq(rsp, frame::arg_reg_save_area_bytes);
#endif
  // Skip AOT caching C strings in scratch buffer.
  const char* str = (code_section()->scratch_emit()) ? msg : AOTCodeCache::add_C_string(msg);
  lea(c_rarg0, ExternalAddress((address) str));
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}
408
// Emit code that dumps the current register state via print_state64 and
// then resumes execution with all registers restored.
void MacroAssembler::print_state() {
  address rip = pc();
  pusha(); // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16); // align stack as required by push_CPU_state and call
  push_CPU_state(); // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}
426
427 #ifndef PRODUCT
428 extern "C" void findpc(intptr_t x);
429 #endif
430
// Runtime target of MacroAssembler::stop(): optionally offers an
// interactive register dump (when ShowMessageBoxOnError), then aborts the
// VM with a fatal error carrying the stop message.
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}
453
454 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
455 ttyLocker ttyl;
456 DebuggingContext debugging{};
457 tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
458 #ifndef PRODUCT
459 tty->cr();
460 findpc(pc);
461 tty->cr();
462 #endif
463 #define PRINT_REG(rax, value) \
464 { tty->print("%s = ", #rax); os::print_location(tty, value); }
465 PRINT_REG(rax, regs[15]);
466 PRINT_REG(rbx, regs[12]);
467 PRINT_REG(rcx, regs[14]);
468 PRINT_REG(rdx, regs[13]);
469 PRINT_REG(rdi, regs[8]);
470 PRINT_REG(rsi, regs[9]);
471 PRINT_REG(rbp, regs[10]);
472 // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
473 PRINT_REG(rsp, (intptr_t)(®s[16]));
474 PRINT_REG(r8 , regs[7]);
475 PRINT_REG(r9 , regs[6]);
476 PRINT_REG(r10, regs[5]);
477 PRINT_REG(r11, regs[4]);
478 PRINT_REG(r12, regs[3]);
479 PRINT_REG(r13, regs[2]);
480 PRINT_REG(r14, regs[1]);
481 PRINT_REG(r15, regs[0]);
482 #undef PRINT_REG
483 // Print some words near the top of the stack.
484 int64_t* rsp = ®s[16];
485 int64_t* dump_sp = rsp;
486 for (int col1 = 0; col1 < 8; col1++) {
487 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
488 os::print_location(tty, *dump_sp++);
489 }
490 for (int row = 0; row < 25; row++) {
491 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
492 for (int col = 0; col < 4; col++) {
493 tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
494 }
495 tty->cr();
496 }
497 // Print some instructions around pc:
498 Disassembler::decode((address)pc-64, (address)pc);
499 tty->print_cr("--------");
500 Disassembler::decode((address)pc, (address)pc+32);
501 }
502
// The java_calling_convention describes stack locations as ideal slots on
// a frame with no abi restrictions. Since we must observe abi restrictions
// (like the placement of the register window) the slots must be biased by
// the following value.
static int reg2offset_in(VMReg r) {
  // Account for saved rbp and return address
  // This should really be in_preserve_stack_slots
  return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
}

// Byte offset (from rsp) of an outgoing stack argument slot.
static int reg2offset_out(VMReg r) {
  return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
}
516
// A long move: copy a 64-bit integer value between register and/or stack
// locations (incoming stack args are rbp-relative, outgoing rsp-relative).
// 'tmp' is clobbered only in the stack-to-stack case.
void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {

  // The calling conventions assures us that each VMregpair is either
  // all really one physical register or adjacent stack slots.

  if (src.is_single_phys_reg() ) {
    if (dst.is_single_phys_reg()) {
      if (dst.first() != src.first()) {
        mov(dst.first()->as_Register(), src.first()->as_Register());
      }
    } else {
      assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
             src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
      movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
    }
  } else if (dst.is_single_phys_reg()) {
    assert(src.is_single_reg(),  "not a stack pair");
    movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
  } else {
    // stack to stack, via tmp
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
    movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
    movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
  }
}
542
// A double move: copy a double value between XMM registers and/or stack
// slots. 'tmp' is clobbered only in the stack-to-stack case.
void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {

  // The calling conventions assures us that each VMregpair is either
  // all really one physical register or adjacent stack slots.

  if (src.is_single_phys_reg() ) {
    if (dst.is_single_phys_reg()) {
      // In theory these overlap but the ordering is such that this is likely a nop
      if ( src.first() != dst.first()) {
        movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
      }
    } else {
      assert(dst.is_single_reg(), "not a stack pair");
      movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
    }
  } else if (dst.is_single_phys_reg()) {
    assert(src.is_single_reg(),  "not a stack pair");
    movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
  } else {
    // stack to stack: move the raw 64-bit pattern through a GP register
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
    movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
    movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
  }
}
568
569
// A float arg may have to do float reg int reg conversion
// Copy a 32-bit float between stack slots and/or XMM registers.
// 'tmp' is clobbered only in the stack-to-stack case.
void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
  assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");

  // The calling conventions assures us that each VMregpair is either
  // all really one physical register or adjacent stack slots.

  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack: move the 32-bit pattern through a GP register
      movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
      movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
    } else {
      // stack to reg
      assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
      movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
    movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
  } else {
    // reg to reg
    // In theory these overlap but the ordering is such that this is likely a nop
    if ( src.first() != dst.first()) {
      // NOTE(review): this copies the whole XMM register (movdbl), which
      // also carries the float in its low 32 bits -- confirm intentional.
      movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
    }
  }
}
598
// On 64 bit we will store integer like items to the stack as
// 64 bits items (x86_32/64 abi) even though java would only store
// 32bits for a parameter. On 32bit it will simply be 32 bits
// So this routine will do 32->32 on 32bit and 32->64 on 64bit
// 'tmp' is clobbered only in the stack-to-stack case.
void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack: sign-extend 32 -> 64 through tmp
      movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
      movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
    } else {
      // stack to reg
      movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    // Do we really have to sign extend???
    // __ movslq(src.first()->as_Register(), src.first()->as_Register());
    movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
  } else {
    // Do we really have to sign extend???
    // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
    if (dst.first() != src.first()) {
      movq(dst.first()->as_Register(), src.first()->as_Register());
    }
  }
}
626
// Copy a pointer-sized value between register and/or stack locations.
// Clobbers rax in the stack-to-stack case.
void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack
      movq(rax, Address(rbp, reg2offset_in(src.first())));
      movq(Address(rsp, reg2offset_out(dst.first())), rax);
    } else {
      // stack to reg
      movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
  } else {
    if (dst.first() != src.first()) {
      movq(dst.first()->as_Register(), src.first()->as_Register());
    }
  }
}
646
// An oop arg. Must pass a handle not the oop itself
// Converts an oop argument into a handle (address of the stack slot holding
// the oop), registers that slot in the OopMap, and passes a null handle for
// a null oop. Clobbers rax when the destination is a stack slot. Records the
// receiver's frame offset through *receiver_offset when is_receiver is set.
void MacroAssembler::object_move(OopMap* map,
                        int oop_handle_offset,
                        int framesize_in_slots,
                        VMRegPair src,
                        VMRegPair dst,
                        bool is_receiver,
                        int* receiver_offset) {

  // must pass a handle. First figure out the location we use as a handle

  Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();

  // See if oop is null if it is we need no handle

  if (src.first()->is_stack()) {

    // Oop is already on the stack as an argument
    int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
    map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
    if (is_receiver) {
      *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
    }

    cmpptr(Address(rbp, reg2offset_in(src.first())), NULL_WORD);
    lea(rHandle, Address(rbp, reg2offset_in(src.first())));
    // conditionally move a null
    cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
  } else {

    // Oop is in a register we must store it to the space we reserve
    // on the stack for oop_handles and pass a handle if oop is non-null

    const Register rOop = src.first()->as_Register();
    int oop_slot;
    // Each java arg register gets its own reserved handle slot.
    if (rOop == j_rarg0)
      oop_slot = 0;
    else if (rOop == j_rarg1)
      oop_slot = 1;
    else if (rOop == j_rarg2)
      oop_slot = 2;
    else if (rOop == j_rarg3)
      oop_slot = 3;
    else if (rOop == j_rarg4)
      oop_slot = 4;
    else {
      assert(rOop == j_rarg5, "wrong register");
      oop_slot = 5;
    }

    oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
    int offset = oop_slot*VMRegImpl::stack_slot_size;

    map->set_oop(VMRegImpl::stack2reg(oop_slot));
    // Store oop in handle area, may be null
    movptr(Address(rsp, offset), rOop);
    if (is_receiver) {
      *receiver_offset = offset;
    }

    cmpptr(rOop, NULL_WORD);
    lea(rHandle, Address(rsp, offset));
    // conditionally move a null from the handle area where it was just stored
    cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
  }

  // If arg is on the stack then place it otherwise it is already in correct reg.
  if (dst.first()->is_stack()) {
    movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
  }
}
718
// Pointer-sized add: on 64-bit these are straight aliases for the 64-bit forms.
void MacroAssembler::addptr(Register dst, int32_t imm32) {
  addq(dst, imm32);
}

void MacroAssembler::addptr(Register dst, Register src) {
  addq(dst, src);
}

void MacroAssembler::addptr(Address dst, Register src) {
  addq(dst, src);
}
730
// Scalar double add from a literal address; indirects through rscratch
// when the address is not RIP-reachable.
void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::addsd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::addsd(dst, Address(rscratch, 0));
  }
}

// Scalar float add from a literal address; same reachability scheme as addsd.
void MacroAssembler::addss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    addss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    addss(dst, Address(rscratch, 0));
  }
}

// Packed double add from a literal address; same reachability scheme as addsd.
void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::addpd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::addpd(dst, Address(rscratch, 0));
  }
}
763
// See 8273459. Function for ensuring 64-byte alignment, intended for stubs only.
// Stub code is generated once and never copied.
// NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
void MacroAssembler::align64() {
  align(64, (uint)(uintptr_t)pc());
}

// 32-byte variant of align64(); same stub-only restriction applies.
void MacroAssembler::align32() {
  align(32, (uint)(uintptr_t)pc());
}

// Align the code offset within the buffer to 'modulus' by emitting nops.
void MacroAssembler::align(uint modulus) {
  // 8273459: Ensure alignment is possible with current segment alignment
  assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
  align(modulus, offset());
}

// Emit enough nops so that 'target' would become a multiple of 'modulus'.
void MacroAssembler::align(uint modulus, uint target) {
  if (target % modulus != 0) {
    nop(modulus - (target % modulus));
  }
}
786
// Push a float from an XMM register onto the stack (one word).
void MacroAssembler::push_f(XMMRegister r) {
  subptr(rsp, wordSize);
  movflt(Address(rsp, 0), r);
}

// Pop a float from the stack into an XMM register.
void MacroAssembler::pop_f(XMMRegister r) {
  movflt(r, Address(rsp, 0));
  addptr(rsp, wordSize);
}

// Push a double from an XMM register onto the stack (two words).
void MacroAssembler::push_d(XMMRegister r) {
  subptr(rsp, 2 * wordSize);
  movdbl(Address(rsp, 0), r);
}

// Pop a double from the stack into an XMM register.
// NOTE(review): pop uses Interpreter::stackElementSize where push used
// wordSize -- presumably equal on this platform; confirm.
void MacroAssembler::pop_d(XMMRegister r) {
  movdbl(r, Address(rsp, 0));
  addptr(rsp, 2 * Interpreter::stackElementSize);
}
806
// Push using the APX push variant when the CPU supports it, else plain push.
void MacroAssembler::push_ppx(Register src) {
  if (VM_Version::supports_apx_f()) {
    pushp(src);
  } else {
    Assembler::push(src);
  }
}

// Pop counterpart of push_ppx().
void MacroAssembler::pop_ppx(Register dst) {
  if (VM_Version::supports_apx_f()) {
    popp(dst);
  } else {
    Assembler::pop(dst);
  }
}
822
// Packed-double AND with a memory operand, typically for sign masking.
// For dst in the high bank (encoding >= 16) without AVX512DQ/VL support,
// falls back to a 512-bit vpand which can encode those registers.
void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (UseAVX > 2 &&
      (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
      (dst->encoding() >= 16)) {
    vpand(dst, dst, src, AVX_512bit, rscratch);
  } else if (reachable(src)) {
    Assembler::andpd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::andpd(dst, Address(rscratch, 0));
  }
}

// Packed-single AND with a memory operand, typically for sign masking.
void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register rscratch) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::andps(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::andps(dst, Address(rscratch, 0));
  }
}
852
// Pointer-sized AND: alias for the 64-bit form.
void MacroAssembler::andptr(Register dst, int32_t imm32) {
  andq(dst, imm32);
}

// 64-bit AND with the value at a literal address; indirects through
// rscratch when the address is not RIP-reachable.
void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    andq(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    andq(dst, Address(rscratch, 0));
  }
}
867
// Atomically increment the 32-bit counter at counter_addr (lock-prefixed inc).
void MacroAssembler::atomic_incl(Address counter_addr) {
  lock();
  incrementl(counter_addr);
}

// Literal-address form; indirects through rscratch when not RIP-reachable.
void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register rscratch) {
  assert(rscratch != noreg || always_reachable(counter_addr), "missing");

  if (reachable(counter_addr)) {
    atomic_incl(as_Address(counter_addr));
  } else {
    lea(rscratch, counter_addr);
    atomic_incl(Address(rscratch, 0));
  }
}

// Atomically increment the 64-bit counter at counter_addr (lock-prefixed inc).
void MacroAssembler::atomic_incq(Address counter_addr) {
  lock();
  incrementq(counter_addr);
}

// Literal-address form; indirects through rscratch when not RIP-reachable.
void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register rscratch) {
  assert(rscratch != noreg || always_reachable(counter_addr), "missing");

  if (reachable(counter_addr)) {
    atomic_incq(as_Address(counter_addr));
  } else {
    lea(rscratch, counter_addr);
    atomic_incq(Address(rscratch, 0));
  }
}
899
// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages. This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-(int)os::vm_page_size())), size );
  subptr(tmp, (int)os::vm_page_size());
  subl(size, (int)os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down including all pages in the shadow zone.
  for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()); i++) {
    // this could be any sized move but this is can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*(int)os::vm_page_size())), size );
  }
}
926
// Check whether rsp has run below the reserved-stack activation boundary;
// if so, enable the reserved zone via the runtime and throw a delayed
// StackOverflowError. Falls through when no enabling is needed.
void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  cmpptr(rsp, Address(r15_thread, JavaThread::reserved_stack_activation_offset()));
  jcc(Assembler::below, no_reserved_zone_enabling);

  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), r15_thread);
  jump(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
  // The jump above never returns here.
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}
940
// Normalize a C-style bool in x: x = (x == 0) ? 0 : 1.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  // since C-style booleans are stored in one byte
  // only! (was bug)
  andl(x, 0xFF);
  setb(Assembler::notZero, x);
}
949
// Wouldn't need if AddressLiteral version had new name
// Call to a label, with an explicit relocation type.
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
  Assembler::call(L, rtype);
}

// Indirect call through a register.
void MacroAssembler::call(Register entry) {
  Assembler::call(entry);
}

// Call to a literal address; uses a direct call when the target is
// RIP-reachable, otherwise an indirect call through rscratch.
void MacroAssembler::call(AddressLiteral entry, Register rscratch) {
  assert(rscratch != noreg || always_reachable(entry), "missing");

  if (reachable(entry)) {
    Assembler::call_literal(entry.target(), entry.rspec());
  } else {
    lea(rscratch, entry);
    Assembler::call(rscratch);
  }
}
969
// Emit an inline-cache call: load the IC data word into rax (patched later)
// and call the given entry, recording a virtual_call relocation.
void MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // Needs full 64-bit immediate for later patching.
  Assembler::mov64(rax, (int64_t)Universe::non_oop_word());
  call(AddressLiteral(entry, rh));
}

// Size in bytes of the instruction sequence emitted by ic_check().
int MacroAssembler::ic_check_size() {
  return UseCompactObjectHeaders ? 17 : 14;
}
980
// Emit the inline-cache check (UEP): compare the receiver's klass against
// the speculated klass in the IC data (rax) and jump to the IC-miss stub on
// mismatch. Returns the offset of the UEP; the following VEP is aligned to
// end_alignment.
int MacroAssembler::ic_check(int end_alignment) {
  Register receiver = j_rarg0;
  Register data = rax;
  Register temp = rscratch1;

  // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
  // before the inline cache check, so we don't have to execute any nop instructions when dispatching
  // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after
  align(end_alignment, offset() + ic_check_size());

  int uep_offset = offset();

  if (UseCompactObjectHeaders) {
    // Narrow klass lives in the object header; extract it before comparing.
    load_narrow_klass_compact(temp, receiver);
    cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
  } else {
    movl(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
    cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
  }

  // if inline cache check fails, then jump to runtime routine
  jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  assert((offset() % end_alignment) == 0, "Misaligned verified entry point (%d, %d, %d)", uep_offset, offset(), end_alignment);

  return uep_offset;
}
1008
// Emit the to-be-patched static call stub: a zapped Method* load into rbx
// followed by a self-referential jump, both resolved at fixup time.
void MacroAssembler::emit_static_call_stub() {
  // Static stub relocation also tags the Method* in the code-stream.
  mov_metadata(rbx, (Metadata*) nullptr);  // Method is zapped till fixup time.
  // This is recognized as unresolved by relocs/nativeinst/ic code.
  jump(RuntimeAddress(pc()));
}
1015
// Implementation of call_VM versions

// call_VM with no arguments. The call/jmp trampoline below places a return
// address on the stack so call_VM_helper can compute last_Java_sp/pc; the
// out-of-line section (C) performs the actual VM call.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
  ret(0);

  bind(E);
}

// call_VM with one register argument (same trampoline scheme as above).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
  ret(0);

  bind(E);
}

// call_VM with two register arguments. Arguments are passed last-to-first
// so an earlier pass_arg cannot clobber a later source register; the
// asserts enforce that precondition.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  assert_different_registers(arg_1, c_rarg2);

  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
  ret(0);

  bind(E);
}

// call_VM with three register arguments (see two-argument version for the
// argument-shuffling discipline).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
  ret(0);

  bind(E);
}
1091
// call_VM variants taking an explicit last_java_sp (caller supplies the SP
// to record in the Java frame anchor); they forward to call_VM_base.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

// Arguments are shuffled last-to-first; asserts guarantee no source is
// clobbered by an earlier pass_arg.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
1136
// super_call_VM: like call_VM with explicit last_java_sp, but invokes the
// MacroAssembler base implementation directly (bypassing any subclass
// override of call_VM_base).
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   int number_of_arguments,
                                   bool check_exceptions) {
  MacroAssembler::call_VM_base(oop_result, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   bool check_exceptions) {
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

// Arguments are shuffled last-to-first; asserts guarantee no source is
// clobbered by an earlier pass_arg.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   bool check_exceptions) {

  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   Register arg_3,
                                   bool check_exceptions) {
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
1181
// Core of all call_VM variants: sets up the last Java frame, calls the VM
// entry point with r15_thread as the implicit first C argument, then tears
// the anchor down, optionally forwards pending exceptions, and fetches the
// oop result from the thread if requested.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  int number_of_arguments,
                                  bool check_exceptions) {
  Register java_thread = r15_thread;

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // r12 is the heapbase.
  if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, r15_thread);

  // set last Java frame before call
  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(last_java_sp, rbp, nullptr, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

#ifdef ASSERT
  // Check that thread register is not clobbered.
  guarantee(java_thread != rax, "change this code");
  push(rax);
  { Label L;
    get_thread_slow(rax);
    cmpptr(java_thread, rax);
    jcc(Assembler::equal, L);
    STOP("MacroAssembler::call_VM_base: java_thread not callee saved?");
    bind(L);
  }
  pop(rax);
#endif

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

   // C++ interp handles this in the interpreter
  check_and_handle_popframe();
  check_and_handle_earlyret();

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach

    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result_oop(oop_result);
  }
}
1257
// Helper for the trampoline-style call_VM variants: computes last_Java_sp
// from the return address the trampoline pushed, then calls call_VM_base.
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  // Calculate the value for last_Java_sp somewhat subtle.
  // call_VM does an intermediate call which places a return address on
  // the stack just under the stack pointer as the user finished with it.
  // This allows use to retrieve last_Java_pc from last_Java_sp[-1].

  // We've pushed one address, correct last_Java_sp
  lea(rax, Address(rsp, wordSize));

  call_VM_base(oop_result, rax, entry_point, number_of_arguments, check_exceptions);
}
1269
// Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
void MacroAssembler::call_VM_leaf0(address entry_point) {
  MacroAssembler::call_VM_leaf_base(entry_point, 0);
}

// Leaf call (no Java frame anchor / exception handling) with the given
// number of already-placed arguments.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 1);
}

// Arguments are shuffled last-to-first; asserts guarantee no source is
// clobbered by an earlier pass_arg.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert_different_registers(arg_0, c_rarg1);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2);
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 3);
}
1300
1301 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1302 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
1303 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1304 assert_different_registers(arg_2, c_rarg3);
1305 pass_arg3(this, arg_3);
1306 pass_arg2(this, arg_2);
1307 pass_arg1(this, arg_1);
1308 pass_arg0(this, arg_0);
1309 call_VM_leaf(entry_point, 3);
1310 }
1311
// super_call_VM_leaf: leaf-call variants that invoke the MacroAssembler
// base implementation directly (bypassing any subclass override).
void MacroAssembler::super_call_VM_leaf(address entry_point) {
  // NOTE(review): passes 1 although no arguments are placed here, unlike
  // call_VM_leaf0 which passes 0 — presumably benign on x86-64 where the
  // count is not used for stack setup; confirm before relying on it.
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

// Arguments are shuffled last-to-first; asserts guarantee no source is
// clobbered by an earlier pass_arg.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  assert_different_registers(arg_0, c_rarg1);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2);
  assert_different_registers(arg_1, c_rarg2);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
  assert_different_registers(arg_1, c_rarg2, c_rarg3);
  assert_different_registers(arg_2, c_rarg3);
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
1347
// Fetch the oop result the VM left in the thread-local slot, clear the slot,
// and verify the oop (debug builds only).
void MacroAssembler::get_vm_result_oop(Register oop_result) {
  movptr(oop_result, Address(r15_thread, JavaThread::vm_result_oop_offset()));
  movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
  verify_oop_msg(oop_result, "broken oop in call_VM_base");
}

// Fetch the metadata result from the thread-local slot and clear the slot.
void MacroAssembler::get_vm_result_metadata(Register metadata_result) {
  movptr(metadata_result, Address(r15_thread, JavaThread::vm_result_metadata_offset()));
  movptr(Address(r15_thread, JavaThread::vm_result_metadata_offset()), NULL_WORD);
}
1358
// Hooks called after a VM call; intentionally empty here — overridden where
// the interpreter needs early-return / popframe processing.
void MacroAssembler::check_and_handle_earlyret() {
}

void MacroAssembler::check_and_handle_popframe() {
}
1364
1365 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm, Register rscratch) {
1366 assert(rscratch != noreg || always_reachable(src1), "missing");
1367
1368 if (reachable(src1)) {
1369 cmpl(as_Address(src1), imm);
1370 } else {
1371 lea(rscratch, src1);
1372 cmpl(Address(rscratch, 0), imm);
1373 }
1374 }
1375
// Compare a register against the 32-bit value at a literal address; uses
// rscratch when the address is not RIP-reachable.
void MacroAssembler::cmp32(Register src1, AddressLiteral src2, Register rscratch) {
  assert(!src2.is_lval(), "use cmpptr");
  assert(rscratch != noreg || always_reachable(src2), "missing");

  if (reachable(src2)) {
    cmpl(src1, as_Address(src2));
  } else {
    lea(rscratch, src2);
    cmpl(src1, Address(rscratch, 0));
  }
}

// 32-bit register/immediate compare.
void MacroAssembler::cmp32(Register src1, int32_t imm) {
  Assembler::cmpl(src1, imm);
}

// 32-bit register/memory compare.
void MacroAssembler::cmp32(Register src1, Address src2) {
  Assembler::cmpl(src1, src2);
}
1395
// Compare two doubles and materialize the Java-style result in dst:
// -1 if opr1 < opr2, 0 if equal, 1 if greater. An unordered comparison
// (NaN input, signalled via the parity flag) yields -1 or 1 depending on
// unordered_is_less (dcmpl vs dcmpg semantics).
void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomisd(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);
    jcc(Assembler::below , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    increment(dst);
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);
    jcc(Assembler::above , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    decrementl(dst);
  }
  bind(L);
}

// Single-precision version of cmpsd2int (fcmpl/fcmpg semantics).
void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomiss(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);
    jcc(Assembler::below , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    increment(dst);
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);
    jcc(Assembler::above , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    decrementl(dst);
  }
  bind(L);
}
1439
1440
1441 void MacroAssembler::cmp8(AddressLiteral src1, int imm, Register rscratch) {
1442 assert(rscratch != noreg || always_reachable(src1), "missing");
1443
1444 if (reachable(src1)) {
1445 cmpb(as_Address(src1), imm);
1446 } else {
1447 lea(rscratch, src1);
1448 cmpb(Address(rscratch, 0), imm);
1449 }
1450 }
1451
// Pointer-width compare of a register against a literal. An lval literal is
// compared by value (its address), otherwise the 64-bit datum at the
// literal address is loaded/compared, through rscratch if unreachable.
void MacroAssembler::cmpptr(Register src1, AddressLiteral src2, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src2), "missing");

  if (src2.is_lval()) {
    movptr(rscratch, src2);
    Assembler::cmpq(src1, rscratch);
  } else if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch, src2);
    Assembler::cmpq(src1, Address(rscratch, 0));
  }
}

// Compare memory against a literal address value (mem vs. lval only).
void MacroAssembler::cmpptr(Address src1, AddressLiteral src2, Register rscratch) {
  assert(src2.is_lval(), "not a mem-mem compare");
  // moves src2's literal address
  movptr(rscratch, src2);
  Assembler::cmpq(src1, rscratch);
}

// Oop compares are plain pointer compares on x86-64.
void MacroAssembler::cmpoop(Register src1, Register src2) {
  cmpptr(src1, src2);
}

void MacroAssembler::cmpoop(Register src1, Address src2) {
  cmpptr(src1, src2);
}

// Compare against a jobject constant: materialize the oop in rscratch first.
void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) {
  movoop(rscratch, src2);
  cmpptr(src1, rscratch);
}
1485
// Lock-prefixed compare-and-exchange of reg with the pointer at the literal
// address adr (rax holds the compare value, per cmpxchg semantics); uses
// rscratch when the address is not RIP-reachable.
void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) {
  assert(rscratch != noreg || always_reachable(adr), "missing");

  if (reachable(adr)) {
    lock();
    cmpxchgptr(reg, as_Address(adr));
  } else {
    lea(rscratch, adr);
    lock();
    cmpxchgptr(reg, Address(rscratch, 0));
  }
}

// Pointer-width cmpxchg is the 64-bit form on this platform.
void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
  cmpxchgq(reg, adr);
}
1502
// Ordered double compare against a literal-addressed operand; uses rscratch
// when the address is not RIP-reachable.
void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::comisd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::comisd(dst, Address(rscratch, 0));
  }
}

// Ordered float compare against a literal-addressed operand.
void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::comiss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::comiss(dst, Address(rscratch, 0));
  }
}
1524
1525
// Conditionally (on 'cond') atomically increment the 32-bit counter at
// counter_addr. Flags are preserved across the increment via pushf/popf.
void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch) {
  assert(rscratch != noreg || always_reachable(counter_addr), "missing");

  // Skip the increment when the negated condition holds.
  Condition negated_cond = negate_condition(cond);
  Label L;
  jcc(negated_cond, L);
  pushf(); // Preserve flags
  atomic_incl(counter_addr, rscratch);
  popf();
  bind(L);
}
1537
int MacroAssembler::corrected_idivl(Register reg) {
  // Full implementation of Java idiv and irem; checks for
  // special case as described in JVM spec., p.243 & p.271.
  // The function returns the (pc) offset of the idivl
  // instruction - may be needed for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax,: dividend                         min_int
  //         reg: divisor   (may not be rax,/rdx)   -1
  //
  // output: rax,: quotient  (= rax, idiv reg)       min_int
  //         rdx: remainder (= rax, irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
  const int min_int = 0x80000000;
  Label normal_case, special_case;

  // check for special case: min_int / -1 would overflow idivl, so it is
  // handled separately (result = min_int, remainder = 0).
  cmpl(rax, min_int);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
  cmpl(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case: cdql sign-extends rax into rdx as idivl requires.
  bind(normal_case);
  cdql();
  int idivl_offset = offset();
  idivl(reg);

  // normal and special case exit
  bind(special_case);

  return idivl_offset;
}
1573
1574
1575
// Decrement a 32-bit register by 'value', choosing the cheapest encoding.
// min_jint is checked first: negating it would overflow, so it must go
// straight to subl rather than through the incrementl(-value) path.
void MacroAssembler::decrementl(Register reg, int value) {
  if (value == min_jint) {subl(reg, value) ; return; }
  if (value <  0) { incrementl(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decl(reg) ; return; }
  /* else */      { subl(reg, value)       ; return; }
}

// Memory-operand version of the above; same encoding-selection logic.
void MacroAssembler::decrementl(Address dst, int value) {
  if (value == min_jint) {subl(dst, value) ; return; }
  if (value <  0) { incrementl(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decl(dst) ; return; }
  /* else */      { subl(dst, value)       ; return; }
}
1591
// Signed division by 2^shift_value via arithmetic shift. For negative
// dividends, (2^shift - 1) is added first so the shift rounds toward zero
// (plain sar alone would round toward negative infinity).
void MacroAssembler::division_with_shift (Register reg, int shift_value) {
  assert (shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl (reg, reg);
  jcc (Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1 ;

  if (offset == 1) {
    incrementl(reg);
  } else {
    addl(reg, offset);
  }

  bind (_is_positive);
  sarl(reg, shift_value);
}
1608
// Double divide by a literal-addressed operand; uses rscratch when the
// address is not RIP-reachable.
void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::divsd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::divsd(dst, Address(rscratch, 0));
  }
}
1619
1620 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1621 assert(rscratch != noreg || always_reachable(src), "missing");
1622
1623 if (reachable(src)) {
1624 Assembler::divss(dst, as_Address(src));
1625 } else {
1626 lea(rscratch, src);
1627 Assembler::divss(dst, Address(rscratch, 0));
1628 }
1629 }
1630
// Standard frame prologue: save caller's rbp and establish a new frame.
void MacroAssembler::enter() {
  push(rbp);
  mov(rbp, rsp);
}

// Emit the 8-byte post-call nop (0F 1F 84 00 + 32-bit imm) used by
// continuations to find/patch call sites; skipped when disabled.
void MacroAssembler::post_call_nop() {
  if (!Continuations::enabled()) {
    return;
  }
  InstructionMark im(this);
  relocate(post_call_nop_Relocation::spec());
  InlineSkippedInstructionsCounter skipCounter(this);
  emit_int8((uint8_t)0x0f);
  emit_int8((uint8_t)0x1f);
  emit_int8((uint8_t)0x84);
  emit_int8((uint8_t)0x00);
  emit_int32(0x00);
}
1649
1650 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1651 assert(rscratch != noreg || always_reachable(src), "missing");
1652 if (reachable(src)) {
1653 Assembler::mulpd(dst, as_Address(src));
1654 } else {
1655 lea(rscratch, src);
1656 Assembler::mulpd(dst, Address(rscratch, 0));
1657 }
1658 }
1659
// dst = c = a * b + c
// Scalar double FMA; vfmadd231 accumulates into c, then the result is
// copied to dst if they differ.
void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
  Assembler::vfmadd231sd(c, a, b);
  if (dst != c) {
    movdbl(dst, c);
  }
}

// dst = c = a * b + c
// Scalar float FMA.
void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
  Assembler::vfmadd231ss(c, a, b);
  if (dst != c) {
    movflt(dst, c);
  }
}

// dst = c = a * b + c
// Packed double FMA, register operand.
void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231pd(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}

// dst = c = a * b + c
// Packed float FMA, register operand.
void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231ps(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}

// dst = c = a * b + c
// Packed double FMA, memory operand.
void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231pd(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}

// dst = c = a * b + c
// Packed float FMA, memory operand.
void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231ps(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}
1707
// Increment the 32-bit value at a literal address; uses rscratch when the
// address is not RIP-reachable.
void MacroAssembler::incrementl(AddressLiteral dst, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");

  if (reachable(dst)) {
    incrementl(as_Address(dst));
  } else {
    lea(rscratch, dst);
    incrementl(Address(rscratch, 0));
  }
}

// Increment the 32-bit value at an ArrayAddress.
void MacroAssembler::incrementl(ArrayAddress dst, Register rscratch) {
  incrementl(as_Address(dst, rscratch));
}

// Increment a 32-bit register by 'value', choosing the cheapest encoding.
// min_jint is checked first: negating it would overflow, so it must go
// straight to addl rather than through the decrementl(-value) path.
void MacroAssembler::incrementl(Register reg, int value) {
  if (value == min_jint) {addl(reg, value) ; return; }
  if (value <  0) { decrementl(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incl(reg) ; return; }
  /* else */      { addl(reg, value)       ; return; }
}

// Memory-operand version of the above; same encoding-selection logic.
void MacroAssembler::incrementl(Address dst, int value) {
  if (value == min_jint) {addl(dst, value) ; return; }
  if (value <  0) { decrementl(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incl(dst) ; return; }
  /* else */      { addl(dst, value)       ; return; }
}
1738
// Unconditional jump to a literal address: direct jmp when reachable,
// otherwise indirect through rscratch.
void MacroAssembler::jump(AddressLiteral dst, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");
  assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump");
  if (reachable(dst)) {
    jmp_literal(dst.target(), dst.rspec());
  } else {
    lea(rscratch, dst);
    jmp(rscratch);
  }
}
1749
// Conditional jump to a literal address. When reachable, hand-encodes a
// short (2-byte) or near (6-byte) Jcc; otherwise reverses the condition to
// branch around an indirect jmp through rscratch.
void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");
  assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump_cc");
  if (reachable(dst)) {
    InstructionMark im(this);
    relocate(dst.reloc());
    const int short_size = 2;
    const int long_size = 6;
    // Displacements are relative to the end of the instruction, hence the
    // short_size/long_size corrections below.
    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_int8(0x70 | cc);
      emit_int8((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      emit_int8(0x0F);
      emit_int8((unsigned char)(0x80 | cc));
      emit_int32(offs - long_size);
    }
  } else {
#ifdef ASSERT
    warning("reversing conditional branch");
#endif /* ASSERT */
    Label skip;
    jccb(reverse[cc], skip);
    lea(rscratch, dst);
    Assembler::jmp(rscratch);
    bind(skip);
  }
}
1780
// Save the current MXCSR to mxcsr_save and compare it (masked per platform
// policy) against the standard MXCSR value; flags hold the result.
// Clobbers tmp; rscratch may be needed to reach the standard value.
void MacroAssembler::cmp32_mxcsr_std(Address mxcsr_save, Register tmp, Register rscratch) {
  ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
  assert(rscratch != noreg || always_reachable(mxcsr_std), "missing");

  stmxcsr(mxcsr_save);
  movl(tmp, mxcsr_save);
  if (EnableX86ECoreOpts) {
    // The mxcsr_std has status bits set for performance on ECore
    orl(tmp, 0x003f);
  } else {
    // Mask out status bits (only check control and mask bits)
    andl(tmp, 0xFFC0);
  }
  cmp32(tmp, mxcsr_std, rscratch);
}
1796
1797 void MacroAssembler::ldmxcsr(AddressLiteral src, Register rscratch) {
1798 assert(rscratch != noreg || always_reachable(src), "missing");
1799
1800 if (reachable(src)) {
1801 Assembler::ldmxcsr(as_Address(src));
1802 } else {
1803 lea(rscratch, src);
1804 Assembler::ldmxcsr(Address(rscratch, 0));
1805 }
1806 }
1807
// Sign-extending byte load; returns the code offset of the load (useful for
// implicit null-check bookkeeping).
int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off = offset();
  movsbl(dst, src); // movsxb
  return off;
}

// Note: load_signed_short used to be called load_signed_word.
// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
// The term "word" in HotSpot means a 32- or 64-bit machine word.
int MacroAssembler::load_signed_short(Register dst, Address src) {
  // This is dubious to me since it seems safe to do a signed 16 => 64 bit
  // version but this is what 64bit has always done. This seems to imply
  // that users are only using 32bits worth.
  int off = offset();
  movswl(dst, src); // movsxw
  return off;
}

// Zero-extending byte load; returns the code offset of the load.
int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off = offset();
  movzbl(dst, src); // movzxb
  return off;
}

// Note: load_unsigned_short used to be called load_unsigned_word.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off = offset();
  movzwl(dst, src); // movzxw
  return off;
}
1843
// Load a value of size_in_bytes (8/4/2/1) from src into dst, sign- or
// zero-extending sub-word sizes per is_signed. dst2 is unused on x86-64.
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
  case  8:  movq(dst, src); break;
  case  4:  movl(dst, src); break;
  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default:  ShouldNotReachHere();
  }
}

// Store the low size_in_bytes (8/4/2/1) of src to dst. src2 is unused on
// x86-64.
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
  case  8:  movq(dst, src); break;
  case  4:  movl(dst, src); break;
  case  2:  movw(dst, src); break;
  case  1:  movb(dst, src); break;
  default:  ShouldNotReachHere();
  }
}
1863
// Store a 32-bit register to a literal address; uses rscratch when the
// address is not RIP-reachable.
void MacroAssembler::mov32(AddressLiteral dst, Register src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");

  if (reachable(dst)) {
    movl(as_Address(dst), src);
  } else {
    lea(rscratch, dst);
    movl(Address(rscratch, 0), src);
  }
}

// Load 32 bits from a literal address. No scratch register is taken: in the
// unreachable case dst itself holds the address and is then overwritten.
void MacroAssembler::mov32(Register dst, AddressLiteral src) {
  if (reachable(src)) {
    movl(dst, as_Address(src));
  } else {
    lea(dst, src);
    movl(dst, Address(dst, 0));
  }
}
1883
// C++ bool manipulation

// Load a C++ bool from memory; the branch on sizeof(bool) is resolved at
// compile time, keeping the load width in sync with the host ABI.
void MacroAssembler::movbool(Register dst, Address src) {
  if(sizeof(bool) == 1)
    movb(dst, src);
  else if(sizeof(bool) == 2)
    movw(dst, src);
  else if(sizeof(bool) == 4)
    movl(dst, src);
  else
    // unsupported
    ShouldNotReachHere();
}

// Store a bool constant to memory at the ABI's bool width.
void MacroAssembler::movbool(Address dst, bool boolconst) {
  if(sizeof(bool) == 1)
    movb(dst, (int) boolconst);
  else if(sizeof(bool) == 2)
    movw(dst, (int) boolconst);
  else if(sizeof(bool) == 4)
    movl(dst, (int) boolconst);
  else
    // unsupported
    ShouldNotReachHere();
}

// Store a register-held bool to memory at the ABI's bool width.
void MacroAssembler::movbool(Address dst, Register src) {
  if(sizeof(bool) == 1)
    movb(dst, src);
  else if(sizeof(bool) == 2)
    movw(dst, src);
  else if(sizeof(bool) == 4)
    movl(dst, src);
  else
    // unsupported
    ShouldNotReachHere();
}
1921
// The wrappers below all follow the same pattern: if the AddressLiteral is
// rip-relative reachable, use it directly; otherwise materialize the address
// in 'rscratch' and access through it. The assert ties the two together:
// a caller may only pass noreg when the target is guaranteed reachable.

// Load 32 bits from 'src' into the low dword of XMM register 'dst'.
void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    movdl(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    movdl(dst, Address(rscratch, 0));
  }
}

// Load 64 bits from 'src' into the low qword of XMM register 'dst'.
void MacroAssembler::movq(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    movq(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    movq(dst, Address(rscratch, 0));
  }
}

// Load a double from 'src' into 'dst'. MOVSD zeroes the upper half of the
// register; MOVLPD leaves it untouched, which avoids a partial-register
// stall on CPUs where MOVSD-from-memory is slower (UseXmmLoadAndClearUpper).
void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    if (UseXmmLoadAndClearUpper) {
      movsd (dst, as_Address(src));
    } else {
      movlpd(dst, as_Address(src));
    }
  } else {
    lea(rscratch, src);
    if (UseXmmLoadAndClearUpper) {
      movsd (dst, Address(rscratch, 0));
    } else {
      movlpd(dst, Address(rscratch, 0));
    }
  }
}

// Load a float from 'src' into the low dword of 'dst'.
void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    movss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    movss(dst, Address(rscratch, 0));
  }
}
1973
// Move a 16-bit half-float value from XMM 'src' to XMM 'dst'.
// With AVX10.2 a direct xmm-to-xmm evmovw is available; otherwise the value
// is bounced through the GPR 'rscratch' (xmm -> gpr -> xmm).
// NOTE(review): the fallback path clobbers 'rscratch' — presumably callers
// treat it as a pure scratch register; confirm at call sites.
void MacroAssembler::movhlf(XMMRegister dst, XMMRegister src, Register rscratch) {
  if (VM_Version::supports_avx10_2()) {
    evmovw(dst, src);
  } else {
    assert(rscratch != noreg, "missing");
    evmovw(rscratch, src);
    evmovw(dst, rscratch);
  }
}
1983
// Load the 64-bit immediate 'imm64' into 'dst' using the shortest encoding:
// a 32-bit zero-extending movl, a sign-extended 32-bit movq, or the full
// 10-byte mov64 as a last resort.
void MacroAssembler::mov64(Register dst, int64_t imm64) {
  if (is_uimm32(imm64)) {
    movl(dst, checked_cast<uint32_t>(imm64));
  } else if (is_simm32(imm64)) {
    movq(dst, checked_cast<int32_t>(imm64));
  } else {
    Assembler::mov64(dst, imm64);
  }
}

// Relocated form: always emits the full 64-bit immediate so the relocation
// has a fixed, patchable encoding (no short-form optimization).
void MacroAssembler::mov64(Register dst, int64_t imm64, relocInfo::relocType rtype, int format) {
  Assembler::mov64(dst, imm64, rtype, format);
}

// Pointer-width register-to-register move (64-bit on this platform).
void MacroAssembler::movptr(Register dst, Register src) {
  movq(dst, src);
}

// Pointer-width load from memory.
void MacroAssembler::movptr(Register dst, Address src) {
  movq(dst, src);
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Register dst, intptr_t src) {
  mov64(dst, src);
}

// Pointer-width store to memory.
void MacroAssembler::movptr(Address dst, Register src) {
  movq(dst, src);
}

// Store a 32-bit immediate sign-extended to 64 bits at 'dst'.
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}
2018
// 128-bit unaligned SSE moves. Without AVX-512VL the legacy encoding can
// only address XMM0-15, hence the encoding asserts.

// Store 128 bits from 'src' to unaligned memory.
void MacroAssembler::movdqu(Address dst, XMMRegister src) {
  assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}

// Load 128 bits from unaligned memory into 'dst'.
void MacroAssembler::movdqu(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}

// Register-to-register 128-bit move.
void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}

// AddressLiteral form: go through 'rscratch' when not rip-relative reachable.
void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    movdqu(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    movdqu(dst, Address(rscratch, 0));
  }
}
2044
// 256-bit unaligned AVX moves, plus vector_len-dispatching variants that
// pick the 128/256/512-bit instruction to match the requested width.

// Store 256 bits from 'src' to unaligned memory.
void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
  assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}

// Load 256 bits from unaligned memory.
void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}

// Register-to-register 256-bit move.
void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}

// AddressLiteral form: go through 'rscratch' when not rip-relative reachable.
void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    vmovdqu(dst, as_Address(src));
  }
  else {
    lea(rscratch, src);
    vmovdqu(dst, Address(rscratch, 0));
  }
}

// Width-dispatching load from an AddressLiteral: 512-bit EVEX, 256-bit VEX,
// or 128-bit SSE depending on 'vector_len'.
void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (vector_len == AVX_512bit) {
    evmovdquq(dst, src, AVX_512bit, rscratch);
  } else if (vector_len == AVX_256bit) {
    vmovdqu(dst, src, rscratch);
  } else {
    movdqu(dst, src, rscratch);
  }
}

// Width-dispatching register-to-register move.
void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src, int vector_len) {
  if (vector_len == AVX_512bit) {
    evmovdquq(dst, src, AVX_512bit);
  } else if (vector_len == AVX_256bit) {
    vmovdqu(dst, src);
  } else {
    movdqu(dst, src);
  }
}

// Width-dispatching store to memory.
void MacroAssembler::vmovdqu(Address dst, XMMRegister src, int vector_len) {
  if (vector_len == AVX_512bit) {
    evmovdquq(dst, src, AVX_512bit);
  } else if (vector_len == AVX_256bit) {
    vmovdqu(dst, src);
  } else {
    movdqu(dst, src);
  }
}

// Width-dispatching load from memory.
void MacroAssembler::vmovdqu(XMMRegister dst, Address src, int vector_len) {
  if (vector_len == AVX_512bit) {
    evmovdquq(dst, src, AVX_512bit);
  } else if (vector_len == AVX_256bit) {
    vmovdqu(dst, src);
  } else {
    movdqu(dst, src);
  }
}
2113
// Aligned AVX moves (vmovdqa requires the source to be suitably aligned).

// Load from an AddressLiteral; use 'rscratch' when not rip-relative reachable.
void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    vmovdqa(dst, as_Address(src));
  }
  else {
    lea(rscratch, src);
    vmovdqa(dst, Address(rscratch, 0));
  }
}

// Width-dispatching aligned load: 512-bit EVEX, 256-bit VEX, or 128-bit SSE.
void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (vector_len == AVX_512bit) {
    evmovdqaq(dst, src, AVX_512bit, rscratch);
  } else if (vector_len == AVX_256bit) {
    vmovdqa(dst, src, rscratch);
  } else {
    movdqa(dst, src, rscratch);
  }
}
2137
// Opmask (k-register) moves. With AVX512BW masks are 64 bits wide (kmovq);
// base EVEX only guarantees 16-bit masks (kmovw). All five overloads pick
// the width from the same CPU-feature test.

void MacroAssembler::kmov(KRegister dst, Address src) {
  if (VM_Version::supports_avx512bw()) {
    kmovql(dst, src);
  } else {
    assert(VM_Version::supports_evex(), "");
    kmovwl(dst, src);
  }
}

void MacroAssembler::kmov(Address dst, KRegister src) {
  if (VM_Version::supports_avx512bw()) {
    kmovql(dst, src);
  } else {
    assert(VM_Version::supports_evex(), "");
    kmovwl(dst, src);
  }
}

void MacroAssembler::kmov(KRegister dst, KRegister src) {
  if (VM_Version::supports_avx512bw()) {
    kmovql(dst, src);
  } else {
    assert(VM_Version::supports_evex(), "");
    kmovwl(dst, src);
  }
}

void MacroAssembler::kmov(Register dst, KRegister src) {
  if (VM_Version::supports_avx512bw()) {
    kmovql(dst, src);
  } else {
    assert(VM_Version::supports_evex(), "");
    kmovwl(dst, src);
  }
}

void MacroAssembler::kmov(KRegister dst, Register src) {
  if (VM_Version::supports_avx512bw()) {
    kmovql(dst, src);
  } else {
    assert(VM_Version::supports_evex(), "");
    kmovwl(dst, src);
  }
}
2182
// Explicit-width opmask loads from an AddressLiteral; use 'rscratch' when
// the literal is not rip-relative reachable.

void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    kmovql(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    kmovql(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    kmovwl(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    kmovwl(dst, Address(rscratch, 0));
  }
}
2204
// EVEX masked/unmasked vector moves from AddressLiterals, by element size
// (b/w/l/q) and alignment (dqu = unaligned, dqa = aligned). The masked forms
// take a KRegister and a merge flag (merge vs. zero masked-off elements).
// All use the standard reachable/lea-through-rscratch dispatch.

void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
                               int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
  } else {
    lea(rscratch, src);
    Assembler::evmovdqub(dst, mask, Address(rscratch, 0), merge, vector_len);
  }
}

void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
                               int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
  } else {
    lea(rscratch, src);
    Assembler::evmovdquw(dst, mask, Address(rscratch, 0), merge, vector_len);
  }
}

void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
  } else {
    lea(rscratch, src);
    Assembler::evmovdqul(dst, mask, Address(rscratch, 0), merge, vector_len);
  }
}

void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
  } else {
    lea(rscratch, src);
    Assembler::evmovdquq(dst, mask, Address(rscratch, 0), merge, vector_len);
  }
}

// Unmasked qword-element unaligned load.
void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evmovdquq(dst, as_Address(src), vector_len);
  } else {
    lea(rscratch, src);
    Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
  }
}

// Masked qword-element aligned load.
void MacroAssembler::evmovdqaq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evmovdqaq(dst, mask, as_Address(src), merge, vector_len);
  } else {
    lea(rscratch, src);
    Assembler::evmovdqaq(dst, mask, Address(rscratch, 0), merge, vector_len);
  }
}

// Unmasked qword-element aligned load.
void MacroAssembler::evmovdqaq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evmovdqaq(dst, as_Address(src), vector_len);
  } else {
    lea(rscratch, src);
    Assembler::evmovdqaq(dst, Address(rscratch, 0), vector_len);
  }
}
2283
// Scalar/packed FP loads from AddressLiterals with the standard
// reachable/lea-through-rscratch dispatch.

// Aligned packed-double load.
void MacroAssembler::movapd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::movapd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::movapd(dst, Address(rscratch, 0));
  }
}

// Aligned 128-bit integer load.
void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::movdqa(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::movdqa(dst, Address(rscratch, 0));
  }
}

// Scalar double load.
void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::movsd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::movsd(dst, Address(rscratch, 0));
  }
}

// Scalar float load.
void MacroAssembler::movss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::movss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::movss(dst, Address(rscratch, 0));
  }
}

// Load one double and duplicate it into both 64-bit lanes.
void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::movddup(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::movddup(dst, Address(rscratch, 0));
  }
}

// Vector variant of movddup for the given vector length.
void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::vmovddup(dst, as_Address(src), vector_len);
  } else {
    lea(rscratch, src);
    Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
  }
}
2349
2350 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2351 assert(rscratch != noreg || always_reachable(src), "missing");
2352
2353 if (reachable(src)) {
2354 Assembler::mulsd(dst, as_Address(src));
2355 } else {
2356 lea(rscratch, src);
2357 Assembler::mulsd(dst, Address(rscratch, 0));
2358 }
2359 }
2360
2361 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2362 assert(rscratch != noreg || always_reachable(src), "missing");
2363
2364 if (reachable(src)) {
2365 Assembler::mulss(dst, as_Address(src));
2366 } else {
2367 lea(rscratch, src);
2368 Assembler::mulss(dst, Address(rscratch, 0));
2369 }
2370 }
2371
// Null-check 'reg'. If 'offset' is small enough that the eventual field
// access at M[reg + offset] would itself fault inside the guard page, no
// code is emitted and the implicit exception does the work; otherwise an
// explicit dummy access to M[reg] is emitted to provoke the SEGV up front.
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS null exception if reg is null by
    // accessing M[reg] w/o changing any (non-CC) registers
    // NOTE: cmpl is plenty here to provoke a segv
    cmpptr(rax, Address(reg, 0));
    // Note: should probably use testl(rax, Address(reg, 0));
    // may be shorter code (however, this version of
    // testl needs to be implemented first)
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS null exception if reg is null
  }
}
2386
// Branch to 'is_inline_type' if 'markword' carries the inline-type bit
// pattern. Destroys 'markword'.
void MacroAssembler::test_markword_is_inline_type(Register markword, Label& is_inline_type) {
  andptr(markword, markWord::inline_type_pattern_mask);
  cmpptr(markword, markWord::inline_type_pattern);
  jcc(Assembler::equal, is_inline_type);
}

// Branch to 'not_inline_type' if 'object' is not an inline-type instance
// (or is null, when 'can_be_null'). Loads the mark word into 'tmp'.
void MacroAssembler::test_oop_is_not_inline_type(Register object, Register tmp, Label& not_inline_type, bool can_be_null) {
  if (can_be_null) {
    testptr(object, object);
    jcc(Assembler::zero, not_inline_type);
  }
  const int is_inline_type_mask = markWord::inline_type_pattern;
  movptr(tmp, Address(object, oopDesc::mark_offset_in_bytes()));
  andptr(tmp, is_inline_type_mask);
  cmpptr(tmp, is_inline_type_mask);
  jcc(Assembler::notEqual, not_inline_type);
}

// The four field-flag tests below copy 'flags' into 'temp_reg' (leaving
// 'flags' intact) and branch on a single ResolvedFieldEntry flag bit.

void MacroAssembler::test_field_is_null_free_inline_type(Register flags, Register temp_reg, Label& is_null_free_inline_type) {
  movl(temp_reg, flags);
  testl(temp_reg, 1 << ResolvedFieldEntry::is_null_free_inline_type_shift);
  jcc(Assembler::notEqual, is_null_free_inline_type);
}

void MacroAssembler::test_field_is_not_null_free_inline_type(Register flags, Register temp_reg, Label& not_null_free_inline_type) {
  movl(temp_reg, flags);
  testl(temp_reg, 1 << ResolvedFieldEntry::is_null_free_inline_type_shift);
  jcc(Assembler::equal, not_null_free_inline_type);
}

void MacroAssembler::test_field_is_flat(Register flags, Register temp_reg, Label& is_flat) {
  movl(temp_reg, flags);
  testl(temp_reg, 1 << ResolvedFieldEntry::is_flat_shift);
  jcc(Assembler::notEqual, is_flat);
}

void MacroAssembler::test_field_has_null_marker(Register flags, Register temp_reg, Label& has_null_marker) {
  movl(temp_reg, flags);
  testl(temp_reg, 1 << ResolvedFieldEntry::has_null_marker_shift);
  jcc(Assembler::notEqual, has_null_marker);
}
2428
// Test a prototype-header bit ('test_bit') of 'oop' and jump to 'jmp_label'
// when the bit is set (jmp_set == true) or clear (jmp_set == false).
// Fast path reads the bit straight from the object's mark word; if the mark
// word is locked (unlocked_value bit clear) the bit is taken from the klass
// prototype header instead. Clobbers 'temp_reg'; preserves rscratch1.
void MacroAssembler::test_oop_prototype_bit(Register oop, Register temp_reg, int32_t test_bit, bool jmp_set, Label& jmp_label) {
  Label test_mark_word;
  // load mark word
  movptr(temp_reg, Address(oop, oopDesc::mark_offset_in_bytes()));
  // check displaced
  testl(temp_reg, markWord::unlocked_value);
  jccb(Assembler::notZero, test_mark_word);
  // slow path use klass prototype
  push(rscratch1);
  load_prototype_header(temp_reg, oop, rscratch1);
  pop(rscratch1);

  bind(test_mark_word);
  testl(temp_reg, test_bit);
  jcc((jmp_set) ? Assembler::notZero : Assembler::zero, jmp_label);
}
2445
// Branch to 'is_flat_array' if 'oop' is a flat (inline-type element) array.
// 64-bit uses the mark-word prototype bit; 32-bit falls back to the klass
// layout helper. Clobbers 'temp_reg'.
void MacroAssembler::test_flat_array_oop(Register oop, Register temp_reg,
                                         Label& is_flat_array) {
#ifdef _LP64
  test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, true, is_flat_array);
#else
  load_klass(temp_reg, oop, noreg);
  movl(temp_reg, Address(temp_reg, Klass::layout_helper_offset()));
  test_flat_array_layout(temp_reg, is_flat_array);
#endif
}

// Inverse of test_flat_array_oop: branch when 'oop' is NOT a flat array.
void MacroAssembler::test_non_flat_array_oop(Register oop, Register temp_reg,
                                             Label& is_non_flat_array) {
#ifdef _LP64
  test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, false, is_non_flat_array);
#else
  load_klass(temp_reg, oop, noreg);
  movl(temp_reg, Address(temp_reg, Klass::layout_helper_offset()));
  test_non_flat_array_layout(temp_reg, is_non_flat_array);
#endif
}

// Branch if 'oop' is a null-free (null-restricted element) array. 64-bit only.
void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label&is_null_free_array) {
#ifdef _LP64
  test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, true, is_null_free_array);
#else
  Unimplemented();
#endif
}

// Branch if 'oop' is NOT a null-free array. 64-bit only.
void MacroAssembler::test_non_null_free_array_oop(Register oop, Register temp_reg, Label&is_non_null_free_array) {
#ifdef _LP64
  test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, false, is_non_null_free_array);
#else
  Unimplemented();
#endif
}

// Branch if the layout helper 'lh' marks a flat-value array.
void MacroAssembler::test_flat_array_layout(Register lh, Label& is_flat_array) {
  testl(lh, Klass::_lh_array_tag_flat_value_bit_inplace);
  jcc(Assembler::notZero, is_flat_array);
}

// Branch if the layout helper 'lh' does NOT mark a flat-value array.
void MacroAssembler::test_non_flat_array_layout(Register lh, Label& is_non_flat_array) {
  testl(lh, Klass::_lh_array_tag_flat_value_bit_inplace);
  jcc(Assembler::zero, is_non_flat_array);
}
2493
// Emit a call into os::breakpoint() rather than an inline int3.
void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
  // (e.g., MSVC can't call ps() otherwise)
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}

// Emit code that halts the VM with an "unimplemented: <what>" message.
// The message is copied into the code buffer's string table (code_string)
// because the ResourceMark frees the stringStream's backing memory.
void MacroAssembler::unimplemented(const char* what) {
  const char* buf = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("unimplemented: %s", what);
    buf = code_string(ss.as_string());
  }
  stop(buf);
}
2510
2511 #define XSTATE_BV 0x200
2512
// Restore integer and FPU state saved by push_CPU_state (reverse order).
void MacroAssembler::pop_CPU_state() {
  pop_FPU_state();
  pop_IU_state();
}

// Restore FPU/SSE state with fxrstor and release the save area.
void MacroAssembler::pop_FPU_state() {
  fxrstor(Address(rsp, 0));
  addptr(rsp, FPUStateSizeInWords * wordSize);
}

// Restore the general-purpose registers and flags.
void MacroAssembler::pop_IU_state() {
  popa();
  addq(rsp, 8); // skip the alignment slot that push_IU_state reserved
  popf();
}

// Save Integer and Float state
// Warning: Stack must be 16 byte aligned (64bit)
void MacroAssembler::push_CPU_state() {
  push_IU_state();
  push_FPU_state();
}

// Reserve the fxsave area and save FPU/SSE state into it.
void MacroAssembler::push_FPU_state() {
  subptr(rsp, FPUStateSizeInWords * wordSize);
  fxsave(Address(rsp, 0));
}

// Save flags and general-purpose registers.
void MacroAssembler::push_IU_state() {
  // Push flags first because pusha kills them
  pushf();
  // Make sure rsp stays 16-byte aligned
  subq(rsp, 8);
  pusha();
}
2548
// Record rsp as the thread's cont_fastpath watermark if rsp is above the
// currently recorded value (i.e. this frame is shallower). No-op when
// continuations are disabled.
void MacroAssembler::push_cont_fastpath() {
  if (!Continuations::enabled()) return;

  Label L_done;
  cmpptr(rsp, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  jccb(Assembler::belowEqual, L_done);
  movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rsp);
  bind(L_done);
}

// Clear the cont_fastpath watermark once the stack has unwound past it
// (rsp at or above the recorded value).
void MacroAssembler::pop_cont_fastpath() {
  if (!Continuations::enabled()) return;

  Label L_done;
  cmpptr(rsp, Address(r15_thread, JavaThread::cont_fastpath_offset()));
  jccb(Assembler::below, L_done);
  movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
  bind(L_done);
}
2568
2569 #ifdef ASSERT
2570 void MacroAssembler::stop_if_in_cont(Register cont, const char* name) {
2571 Label no_cont;
2572 movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset()));
2573 testl(cont, cont);
2574 jcc(Assembler::zero, no_cont);
2575 stop(name);
2576 bind(no_cont);
2577 }
2578 #endif
2579
// Clear the thread's last-Java-frame anchor (sp, optionally fp, and pc) so
// the stack no longer appears walkable from native code.
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }
  // Always clear the pc because it could have been set by make_walkable()
  movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  vzeroupper();
}

// Round 'reg' up to the next multiple of 'modulus' (modulus a power of two).
void MacroAssembler::round_to(Register reg, int modulus) {
  addptr(reg, modulus - 1);
  andptr(reg, -modulus);
}
2597
// Emit a safepoint poll, branching to 'slow_path' when a safepoint/handshake
// is pending. At returns the poll also performs the stack-watermark check by
// comparing the frame pointer (or rsp in nmethods) against the polling word;
// elsewhere only the poll bit of the polling word is tested.
void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod) {
  if (at_return) {
    // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
    // we may safely use rsp instead to perform the stack watermark check.
    cmpptr(in_nmethod ? rsp : rbp, Address(r15_thread, JavaThread::polling_word_offset()));
    jcc(Assembler::above, slow_path);
    return;
  }
  testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
  jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
}
2609
2610 // Calls to C land
2611 //
2612 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded
2613 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
2614 // has to be reset to 0. This is required to allow proper stack traversal.
// Record the last Java frame (sp, optional fp, optional pc) in the thread's
// frame anchor. sp is stored last: presumably a non-null last_Java_sp is what
// marks the anchor as valid for stack walkers. 'rscratch' is only needed when
// the pc literal is not rip-relative reachable.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register rscratch) {
  vzeroupper();
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
  }
  // last_java_pc is optional
  if (last_java_pc != nullptr) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(java_pc, InternalAddress(last_java_pc), rscratch);
  }
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}

// Label variant: the pc is the (not yet bound) code address of 'L',
// materialized into 'scratch' and stored before delegating to the main form.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  lea(scratch, L);
  movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), scratch);
  set_last_Java_frame(last_java_sp, last_java_fp, nullptr, scratch);
}
2645
// Pointer-width shift left (64-bit on this platform).
void MacroAssembler::shlptr(Register dst, int imm8) {
  shlq(dst, imm8);
}

// Pointer-width logical shift right.
void MacroAssembler::shrptr(Register dst, int imm8) {
  shrq(dst, imm8);
}

// Sign-extend the low byte of 'reg' in place to 32 bits.
void MacroAssembler::sign_extend_byte(Register reg) {
  movsbl(reg, reg); // movsxb
}

// Sign-extend the low 16 bits of 'reg' in place to 32 bits.
void MacroAssembler::sign_extend_short(Register reg) {
  movswl(reg, reg); // movsxw
}
2661
2662 void MacroAssembler::testl(Address dst, int32_t imm32) {
2663 if (imm32 >= 0 && is8bit(imm32)) {
2664 testb(dst, imm32);
2665 } else {
2666 Assembler::testl(dst, imm32);
2667 }
2668 }
2669
2670 void MacroAssembler::testl(Register dst, int32_t imm32) {
2671 if (imm32 >= 0 && is8bit(imm32) && dst->has_byte_register()) {
2672 testb(dst, imm32);
2673 } else {
2674 Assembler::testl(dst, imm32);
2675 }
2676 }
2677
2678 void MacroAssembler::testl(Register dst, AddressLiteral src) {
2679 assert(always_reachable(src), "Address should be reachable");
2680 testl(dst, as_Address(src));
2681 }
2682
2683 void MacroAssembler::testq(Address dst, int32_t imm32) {
2684 if (imm32 >= 0) {
2685 testl(dst, imm32);
2686 } else {
2687 Assembler::testq(dst, imm32);
2688 }
2689 }
2690
2691 void MacroAssembler::testq(Register dst, int32_t imm32) {
2692 if (imm32 >= 0) {
2693 testl(dst, imm32);
2694 } else {
2695 Assembler::testq(dst, imm32);
2696 }
2697 }
2698
// Thin wrappers over legacy SSE string/compare instructions. Their legacy
// encodings can only name XMM0-15 (unless the relevant AVX-512 extension
// provides an EVEX form), so each asserts the register encodings up front.

void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pcmpeqb(dst, src);
}

void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pcmpeqw(dst, src);
}

// pcmpestri has no EVEX form at all, so XMM0-15 is required unconditionally.
void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
  assert((dst->encoding() < 16),"XMM register should be 0-15");
  Assembler::pcmpestri(dst, src, imm8);
}

void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
  Assembler::pcmpestri(dst, src, imm8);
}

void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pmovzxbw(dst, src);
}

void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pmovzxbw(dst, src);
}

// pmovmskb/ptest likewise have no EVEX encodings.
void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
  assert((src->encoding() < 16),"XMM register should be 0-15");
  Assembler::pmovmskb(dst, src);
}

void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
  Assembler::ptest(dst, src);
}
2738
// Scalar FP arithmetic against AddressLiteral operands, with the standard
// reachable/lea-through-rscratch dispatch.

// dst = sqrt([src]) (float).
void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::sqrtss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::sqrtss(dst, Address(rscratch, 0));
  }
}

// dst = dst - [src] (double).
void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::subsd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::subsd(dst, Address(rscratch, 0));
  }
}

// dst = round([src]) (double) with rounding mode 'rmode'.
void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::roundsd(dst, as_Address(src), rmode);
  } else {
    lea(rscratch, src);
    Assembler::roundsd(dst, Address(rscratch, 0), rmode);
  }
}

// dst = dst - [src] (float).
void MacroAssembler::subss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::subss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::subss(dst, Address(rscratch, 0));
  }
}
2782
// Scalar FP compares against AddressLiteral operands (double, float, and
// half-float EVEX variants), all using the standard reachable/rscratch
// dispatch.

void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::ucomisd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::ucomisd(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::evucomxsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evucomxsd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::evucomxsd(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::ucomiss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::ucomiss(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::evucomxss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evucomxss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::evucomxss(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::evucomish(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evucomish(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::evucomish(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::evucomxsh(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::evucomxsh(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::evucomxsh(dst, Address(rscratch, 0));
  }
}
2848
// XOR of packed doubles with an in-memory constant.
// Special case: with AVX-512 but without both DQ and VL extensions,
// xorpd cannot encode the extended registers xmm16-31, so a 512-bit
// integer vpxor is emitted instead for those destinations.
void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  // Used in sign-bit flipping with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");

  if (UseAVX > 2 &&
      (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
      (dst->encoding() >= 16)) {
    vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch);
  } else if (reachable(src)) {
    Assembler::xorpd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::xorpd(dst, Address(rscratch, 0));
  }
}
2866
// Register-register XOR of packed doubles. With AVX-512 but without both
// DQ and VL, xorpd cannot encode xmm16-31, so fall back to a 512-bit
// integer vpxor when either operand uses an extended register.
void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
  if (UseAVX > 2 &&
      (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
      ((dst->encoding() >= 16) || (src->encoding() >= 16))) {
    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
  } else {
    Assembler::xorpd(dst, src);
  }
}
2876
// Register-register XOR of packed floats. Same extended-register fallback
// as xorpd: without AVX-512 DQ+VL, use a 512-bit integer vpxor when either
// operand is xmm16-31.
void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
  if (UseAVX > 2 &&
      (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
      ((dst->encoding() >= 16) || (src->encoding() >= 16))) {
    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
  } else {
    Assembler::xorps(dst, src);
  }
}
2886
// XOR of packed floats with an in-memory constant.
// Same AVX-512 (no DQ/VL) extended-register fallback as the xorpd variant.
void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  // Used in sign-bit flipping with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");

  if (UseAVX > 2 &&
      (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
      (dst->encoding() >= 16)) {
    vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch);
  } else if (reachable(src)) {
    Assembler::xorps(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::xorps(dst, Address(rscratch, 0));
  }
}
2904
2905 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src, Register rscratch) {
2906 assert(rscratch != noreg || always_reachable(src), "missing");
2907
2908 // Used in sign-bit flipping with aligned address.
2909 bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
2910 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
2911 if (reachable(src)) {
2912 Assembler::pshufb(dst, as_Address(src));
2913 } else {
2914 lea(rscratch, src);
2915 Assembler::pshufb(dst, Address(rscratch, 0));
2916 }
2917 }
2918
2919 // AVX 3-operands instructions
2920
2921 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
2922 assert(rscratch != noreg || always_reachable(src), "missing");
2923
2924 if (reachable(src)) {
2925 vaddsd(dst, nds, as_Address(src));
2926 } else {
2927 lea(rscratch, src);
2928 vaddsd(dst, nds, Address(rscratch, 0));
2929 }
2930 }
2931
2932 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
2933 assert(rscratch != noreg || always_reachable(src), "missing");
2934
2935 if (reachable(src)) {
2936 vaddss(dst, nds, as_Address(src));
2937 } else {
2938 lea(rscratch, src);
2939 vaddss(dst, nds, Address(rscratch, 0));
2940 }
2941 }
2942
2943 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2944 assert(UseAVX > 0, "requires some form of AVX");
2945 assert(rscratch != noreg || always_reachable(src), "missing");
2946
2947 if (reachable(src)) {
2948 Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
2949 } else {
2950 lea(rscratch, src);
2951 Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
2952 }
2953 }
2954
2955 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2956 assert(UseAVX > 0, "requires some form of AVX");
2957 assert(rscratch != noreg || always_reachable(src), "missing");
2958
2959 if (reachable(src)) {
2960 Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
2961 } else {
2962 lea(rscratch, src);
2963 Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
2964 }
2965 }
2966
// Scalar-float abs via bit masking: ANDs nds with the constant at
// negate_field (presumably a sign-bit-clearing mask — determined by the
// caller-supplied literal, not by this code).
void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
  assert(rscratch != noreg || always_reachable(negate_field), "missing");

  vandps(dst, nds, negate_field, vector_len, rscratch);
}
2973
// Scalar-double abs via bit masking: ANDs nds with the constant at
// negate_field (presumably a sign-bit-clearing mask — determined by the
// caller-supplied literal, not by this code).
void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
  assert(rscratch != noreg || always_reachable(negate_field), "missing");

  vandpd(dst, nds, negate_field, vector_len, rscratch);
}
2980
// Guarded vpaddb: extended registers xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpaddb(dst, nds, src, vector_len);
}
2985
// Guarded vpaddb (memory operand): xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpaddb(dst, nds, src, vector_len);
}
2990
// Guarded vpaddw: extended registers xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpaddw(dst, nds, src, vector_len);
}
2995
// Guarded vpaddw (memory operand): xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpaddw(dst, nds, src, vector_len);
}
3000
3001 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3002 assert(rscratch != noreg || always_reachable(src), "missing");
3003
3004 if (reachable(src)) {
3005 Assembler::vpand(dst, nds, as_Address(src), vector_len);
3006 } else {
3007 lea(rscratch, src);
3008 Assembler::vpand(dst, nds, Address(rscratch, 0), vector_len);
3009 }
3010 }
3011
3012 void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3013 assert(rscratch != noreg || always_reachable(src), "missing");
3014
3015 if (reachable(src)) {
3016 Assembler::vpbroadcastd(dst, as_Address(src), vector_len);
3017 } else {
3018 lea(rscratch, src);
3019 Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len);
3020 }
3021 }
3022
3023 void MacroAssembler::vbroadcasti128(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3024 assert(rscratch != noreg || always_reachable(src), "missing");
3025
3026 if (reachable(src)) {
3027 Assembler::vbroadcasti128(dst, as_Address(src), vector_len);
3028 } else {
3029 lea(rscratch, src);
3030 Assembler::vbroadcasti128(dst, Address(rscratch, 0), vector_len);
3031 }
3032 }
3033
3034 void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3035 assert(rscratch != noreg || always_reachable(src), "missing");
3036
3037 if (reachable(src)) {
3038 Assembler::vpbroadcastq(dst, as_Address(src), vector_len);
3039 } else {
3040 lea(rscratch, src);
3041 Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len);
3042 }
3043 }
3044
3045 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3046 assert(rscratch != noreg || always_reachable(src), "missing");
3047
3048 if (reachable(src)) {
3049 Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
3050 } else {
3051 lea(rscratch, src);
3052 Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
3053 }
3054 }
3055
3056 void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3057 assert(rscratch != noreg || always_reachable(src), "missing");
3058
3059 if (reachable(src)) {
3060 Assembler::vbroadcastss(dst, as_Address(src), vector_len);
3061 } else {
3062 lea(rscratch, src);
3063 Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len);
3064 }
3065 }
3066
// Vector float blend
// vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
//
// On E-core-optimized builds the blend is emulated with and/andn/or sequences,
// which requires a usable scratch register and dst not aliasing the mask.
// When those constraints are not met, the plain vblendvps instruction is used.
void MacroAssembler::vblendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
  // WARN: Allow dst == (src1|src2), mask == scratch
  bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1 &&
                         !(VM_Version::is_intel_darkmont() && (dst == src1)); // partially fixed on Darkmont
  bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst;
  bool dst_available = dst != mask && (dst != src1 || dst != src2);
  if (blend_emulation && scratch_available && dst_available) {
    if (compute_mask) {
      // Smear each lane's sign bit across the lane to form an all-ones/
      // all-zeros select mask (scratch may alias mask here).
      vpsrad(scratch, mask, 32, vector_len);
      mask = scratch;
    }
    // Order the and/andn so the input that aliases dst is consumed first.
    if (dst == src1) {
      vpandn(dst,     mask, src1, vector_len); // if mask == 0, src1
      vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
    } else {
      vpand (dst,     mask, src2, vector_len); // if mask == 1, src2
      vpandn(scratch, mask, src1, vector_len); // if mask == 0, src1
    }
    vpor(dst, dst, scratch, vector_len);
  } else {
    Assembler::vblendvps(dst, src1, src2, mask, vector_len);
  }
}
3092
// vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
//
// Double-precision counterpart of the vblendvps emulation above. Unlike the
// float variant, computing the mask clobbers scratch before mask is read
// (vpxor + vpcmpgtq), so scratch must also differ from mask in that case.
void MacroAssembler::vblendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
  // WARN: Allow dst == (src1|src2), mask == scratch
  bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1 &&
                         !(VM_Version::is_intel_darkmont() && (dst == src1)); // partially fixed on Darkmont
  bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst && (!compute_mask || scratch != mask);
  bool dst_available = dst != mask && (dst != src1 || dst != src2);
  if (blend_emulation && scratch_available && dst_available) {
    if (compute_mask) {
      // mask lane < 0  <=>  0 > mask lane: produces all-ones/all-zeros lanes.
      vpxor(scratch, scratch, scratch, vector_len);
      vpcmpgtq(scratch, scratch, mask, vector_len);
      mask = scratch;
    }
    // Order the and/andn so the input that aliases dst is consumed first.
    if (dst == src1) {
      vpandn(dst,     mask, src1, vector_len); // if mask == 0, src
      vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
    } else {
      vpand (dst,     mask, src2, vector_len); // if mask == 1, src2
      vpandn(scratch, mask, src1, vector_len); // if mask == 0, src
    }
    vpor(dst, dst, scratch, vector_len);
  } else {
    Assembler::vblendvpd(dst, src1, src2, mask, vector_len);
  }
}
3118
// Guarded vpcmpeqb: extended registers xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpcmpeqb(dst, nds, src, vector_len);
}
3123
// Guarded vpcmpeqb (memory operand): xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) {
  assert(((dst->encoding() < 16 && src1->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpcmpeqb(dst, src1, src2, vector_len);
}
3128
// Guarded vpcmpeqw: extended registers xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpcmpeqw(dst, nds, src, vector_len);
}
3133
// Guarded vpcmpeqw (memory operand): xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpcmpeqw(dst, nds, src, vector_len);
}
3138
3139 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3140 assert(rscratch != noreg || always_reachable(src), "missing");
3141
3142 if (reachable(src)) {
3143 Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3144 } else {
3145 lea(rscratch, src);
3146 Assembler::evpcmpeqd(kdst, mask, nds, Address(rscratch, 0), vector_len);
3147 }
3148 }
3149
3150 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3151 int comparison, bool is_signed, int vector_len, Register rscratch) {
3152 assert(rscratch != noreg || always_reachable(src), "missing");
3153
3154 if (reachable(src)) {
3155 Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3156 } else {
3157 lea(rscratch, src);
3158 Assembler::evpcmpd(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3159 }
3160 }
3161
3162 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3163 int comparison, bool is_signed, int vector_len, Register rscratch) {
3164 assert(rscratch != noreg || always_reachable(src), "missing");
3165
3166 if (reachable(src)) {
3167 Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3168 } else {
3169 lea(rscratch, src);
3170 Assembler::evpcmpq(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3171 }
3172 }
3173
3174 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3175 int comparison, bool is_signed, int vector_len, Register rscratch) {
3176 assert(rscratch != noreg || always_reachable(src), "missing");
3177
3178 if (reachable(src)) {
3179 Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3180 } else {
3181 lea(rscratch, src);
3182 Assembler::evpcmpb(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3183 }
3184 }
3185
3186 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3187 int comparison, bool is_signed, int vector_len, Register rscratch) {
3188 assert(rscratch != noreg || always_reachable(src), "missing");
3189
3190 if (reachable(src)) {
3191 Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3192 } else {
3193 lea(rscratch, src);
3194 Assembler::evpcmpw(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3195 }
3196 }
3197
3198 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3199 if (width == Assembler::Q) {
3200 Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3201 } else {
3202 Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3203 }
3204 }
3205
// Emits a packed compare for an arbitrary predicate using only the native
// eq/gt encodings: the remaining predicates are synthesized by swapping the
// operands (lt/nlt) and/or inverting the result with an all-ones XOR
// (neq/le/nlt). xtmp is clobbered whenever an inversion is needed.
void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
  int eq_cond_enc = 0x29;
  int gt_cond_enc = 0x37;
  if (width != Assembler::Q) {
    // Byte/word/dword forms use the legacy pcmpeq/pcmpgt opcodes, which are
    // consecutive per element width.
    eq_cond_enc = 0x74 + width;
    gt_cond_enc = 0x64 + width;
  }
  switch (cond) {
  case eq:
    vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
    break;
  case neq:
    // neq = NOT(eq)
    vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
    vallones(xtmp, vector_len);
    vpxor(dst, xtmp, dst, vector_len);
    break;
  case le:
    // le = NOT(gt)
    vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
    vallones(xtmp, vector_len);
    vpxor(dst, xtmp, dst, vector_len);
    break;
  case nlt:
    // nlt = NOT(src > nds)
    vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
    vallones(xtmp, vector_len);
    vpxor(dst, xtmp, dst, vector_len);
    break;
  case lt:
    // lt = (src > nds) with operands swapped
    vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
    break;
  case nle:
    // nle = gt
    vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
    break;
  default:
    assert(false, "Should not reach here");
  }
}
3242
// Guarded vpmovzxbw: extended registers xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpmovzxbw(dst, src, vector_len);
}
3247
// Guarded vpmovmskb: this instruction has no EVEX form, so the source must
// always be one of xmm0-15.
void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
  assert((src->encoding() < 16),"XMM register should be 0-15");
  Assembler::vpmovmskb(dst, src, vector_len);
}
3252
// Guarded vpmullw: extended registers xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpmullw(dst, nds, src, vector_len);
}
3257
// Guarded vpmullw (memory operand): xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpmullw(dst, nds, src, vector_len);
}
3262
3263 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3264 assert((UseAVX > 0), "AVX support is needed");
3265 assert(rscratch != noreg || always_reachable(src), "missing");
3266
3267 if (reachable(src)) {
3268 Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3269 } else {
3270 lea(rscratch, src);
3271 Assembler::vpmulld(dst, nds, Address(rscratch, 0), vector_len);
3272 }
3273 }
3274
// Guarded vpsubb: extended registers xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsubb(dst, nds, src, vector_len);
}
3279
// Guarded vpsubb (memory operand): xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsubb(dst, nds, src, vector_len);
}
3284
// Guarded vpsubw: extended registers xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsubw(dst, nds, src, vector_len);
}
3289
// Guarded vpsubw (memory operand): xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsubw(dst, nds, src, vector_len);
}
3294
// Guarded vpsraw (variable shift): xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsraw(dst, nds, shift, vector_len);
}
3299
// Guarded vpsraw (immediate shift): xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsraw(dst, nds, shift, vector_len);
}
3304
// EVEX qword arithmetic right shift (variable shift count).
// Without AVX512VL the sub-512-bit EVEX forms are not encodable, so narrower
// requests are promoted to the 512-bit vector length.
void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
  assert(UseAVX > 2,"");
  if (!VM_Version::supports_avx512vl() && vector_len < 2) {
    vector_len = 2;
  }
  Assembler::evpsraq(dst, nds, shift, vector_len);
}
3312
// EVEX qword arithmetic right shift (immediate shift count).
// Without AVX512VL the sub-512-bit EVEX forms are not encodable, so narrower
// requests are promoted to the 512-bit vector length.
void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  assert(UseAVX > 2,"");
  if (!VM_Version::supports_avx512vl() && vector_len < 2) {
    vector_len = 2;
  }
  Assembler::evpsraq(dst, nds, shift, vector_len);
}
3320
// Guarded vpsrlw (variable shift): xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsrlw(dst, nds, shift, vector_len);
}
3325
// Guarded vpsrlw (immediate shift): xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsrlw(dst, nds, shift, vector_len);
}
3330
// Guarded vpsllw (variable shift): xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsllw(dst, nds, shift, vector_len);
}
3335
// Guarded vpsllw (immediate shift): xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsllw(dst, nds, shift, vector_len);
}
3340
// Guarded vptest: this instruction has no EVEX form, so both operands must
// be xmm0-15.
void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
  Assembler::vptest(dst, src);
}
3345
// Guarded punpcklbw: extended registers xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::punpcklbw(dst, src);
}
3350
// Guarded pshufd: extended registers xmm16-31 require AVX-512 VL to encode.
void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::pshufd(dst, src, mode);
}
3355
// Guarded pshuflw: extended registers xmm16-31 require AVX-512 VL+BW to encode.
void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pshuflw(dst, src, mode);
}
3360
3361 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3362 assert(rscratch != noreg || always_reachable(src), "missing");
3363
3364 if (reachable(src)) {
3365 vandpd(dst, nds, as_Address(src), vector_len);
3366 } else {
3367 lea(rscratch, src);
3368 vandpd(dst, nds, Address(rscratch, 0), vector_len);
3369 }
3370 }
3371
3372 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3373 assert(rscratch != noreg || always_reachable(src), "missing");
3374
3375 if (reachable(src)) {
3376 vandps(dst, nds, as_Address(src), vector_len);
3377 } else {
3378 lea(rscratch, src);
3379 vandps(dst, nds, Address(rscratch, 0), vector_len);
3380 }
3381 }
3382
3383 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3384 bool merge, int vector_len, Register rscratch) {
3385 assert(rscratch != noreg || always_reachable(src), "missing");
3386
3387 if (reachable(src)) {
3388 Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3389 } else {
3390 lea(rscratch, src);
3391 Assembler::evpord(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
3392 }
3393 }
3394
3395 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3396 assert(rscratch != noreg || always_reachable(src), "missing");
3397
3398 if (reachable(src)) {
3399 vdivsd(dst, nds, as_Address(src));
3400 } else {
3401 lea(rscratch, src);
3402 vdivsd(dst, nds, Address(rscratch, 0));
3403 }
3404 }
3405
3406 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3407 assert(rscratch != noreg || always_reachable(src), "missing");
3408
3409 if (reachable(src)) {
3410 vdivss(dst, nds, as_Address(src));
3411 } else {
3412 lea(rscratch, src);
3413 vdivss(dst, nds, Address(rscratch, 0));
3414 }
3415 }
3416
3417 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3418 assert(rscratch != noreg || always_reachable(src), "missing");
3419
3420 if (reachable(src)) {
3421 vmulsd(dst, nds, as_Address(src));
3422 } else {
3423 lea(rscratch, src);
3424 vmulsd(dst, nds, Address(rscratch, 0));
3425 }
3426 }
3427
3428 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3429 assert(rscratch != noreg || always_reachable(src), "missing");
3430
3431 if (reachable(src)) {
3432 vmulss(dst, nds, as_Address(src));
3433 } else {
3434 lea(rscratch, src);
3435 vmulss(dst, nds, Address(rscratch, 0));
3436 }
3437 }
3438
3439 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3440 assert(rscratch != noreg || always_reachable(src), "missing");
3441
3442 if (reachable(src)) {
3443 vsubsd(dst, nds, as_Address(src));
3444 } else {
3445 lea(rscratch, src);
3446 vsubsd(dst, nds, Address(rscratch, 0));
3447 }
3448 }
3449
3450 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3451 assert(rscratch != noreg || always_reachable(src), "missing");
3452
3453 if (reachable(src)) {
3454 vsubss(dst, nds, as_Address(src));
3455 } else {
3456 lea(rscratch, src);
3457 vsubss(dst, nds, Address(rscratch, 0));
3458 }
3459 }
3460
// Scalar-float negate via bit masking: XORs nds with the constant at src
// (presumably a sign-bit mask — determined by the caller-supplied literal,
// not by this code).
void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
  assert(rscratch != noreg || always_reachable(src), "missing");

  vxorps(dst, nds, src, Assembler::AVX_128bit, rscratch);
}
3467
// Scalar-double negate via bit masking: XORs nds with the constant at src
// (presumably a sign-bit mask — determined by the caller-supplied literal,
// not by this code).
void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
  assert(rscratch != noreg || always_reachable(src), "missing");

  vxorpd(dst, nds, src, Assembler::AVX_128bit, rscratch);
}
3474
3475 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3476 assert(rscratch != noreg || always_reachable(src), "missing");
3477
3478 if (reachable(src)) {
3479 vxorpd(dst, nds, as_Address(src), vector_len);
3480 } else {
3481 lea(rscratch, src);
3482 vxorpd(dst, nds, Address(rscratch, 0), vector_len);
3483 }
3484 }
3485
3486 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3487 assert(rscratch != noreg || always_reachable(src), "missing");
3488
3489 if (reachable(src)) {
3490 vxorps(dst, nds, as_Address(src), vector_len);
3491 } else {
3492 lea(rscratch, src);
3493 vxorps(dst, nds, Address(rscratch, 0), vector_len);
3494 }
3495 }
3496
// Packed integer XOR with an in-memory constant operand.
// The integer vpxor form is only emitted with AVX2+ or for 128-bit vectors
// (vector_len < 1); otherwise fall back to the floating-point vxorpd, which
// is bitwise-equivalent.
void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (UseAVX > 1 || (vector_len < 1)) {
    if (reachable(src)) {
      Assembler::vpxor(dst, nds, as_Address(src), vector_len);
    } else {
      lea(rscratch, src);
      Assembler::vpxor(dst, nds, Address(rscratch, 0), vector_len);
    }
  } else {
    MacroAssembler::vxorpd(dst, nds, src, vector_len, rscratch);
  }
}
3511
3512 void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3513 assert(rscratch != noreg || always_reachable(src), "missing");
3514
3515 if (reachable(src)) {
3516 Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3517 } else {
3518 lea(rscratch, src);
3519 Assembler::vpermd(dst, nds, Address(rscratch, 0), vector_len);
3520 }
3521 }
3522
// Strips the JNI handle tag bits from a jobject pointer by ANDing with the
// inverted tag mask, leaving the untagged handle address.
void MacroAssembler::clear_jobject_tag(Register possibly_non_local) {
  const int32_t inverted_mask = ~static_cast<int32_t>(JNIHandles::tag_mask);
  STATIC_ASSERT(inverted_mask == -4); // otherwise check this code
  // The inverted mask is sign-extended
  andptr(possibly_non_local, inverted_mask);
}
3529
// Resolves a jobject handle in `value` to the oop it refers to, in place.
// Dispatches on the handle's tag bits: untagged = local handle (raw load),
// global tag = strong global handle, weak tag = weak global handle (loaded
// with the phantom-reference decorator so the GC barrier is applied).
// A null handle resolves to null. Clobbers tmp; the thread register is
// excluded from aliasing only.
void MacroAssembler::resolve_jobject(Register value,
                                     Register tmp) {
  Register thread = r15_thread;
  assert_different_registers(value, thread, tmp);
  Label done, tagged, weak_tagged;
  testptr(value, value);
  jcc(Assembler::zero, done);           // Use null as-is.
  testptr(value, JNIHandles::tag_mask); // Test for tag.
  jcc(Assembler::notZero, tagged);

  // Resolve local handle
  access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp);
  verify_oop(value);
  jmp(done);

  bind(tagged);
  testptr(value, JNIHandles::TypeTag::weak_global); // Test for weak tag.
  jcc(Assembler::notZero, weak_tagged);

  // Resolve global handle: subtract the tag to recover the handle address.
  access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp);
  verify_oop(value);
  jmp(done);

  bind(weak_tagged);
  // Resolve jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                 value, Address(value, -JNIHandles::TypeTag::weak_global), tmp);
  verify_oop(value);

  bind(done);
}
3562
// Resolves a jobject known to be a strong global handle to the oop it refers
// to, in place. Debug builds verify the global tag is present; null resolves
// to null. Clobbers tmp.
void MacroAssembler::resolve_global_jobject(Register value,
                                            Register tmp) {
  Register thread = r15_thread;
  assert_different_registers(value, thread, tmp);
  Label done;

  testptr(value, value);
  jcc(Assembler::zero, done); // Use null as-is.

#ifdef ASSERT
  {
    // Verify the caller's precondition: the handle carries the global tag.
    Label valid_global_tag;
    testptr(value, JNIHandles::TypeTag::global); // Test for global tag.
    jcc(Assembler::notZero, valid_global_tag);
    stop("non global jobject using resolve_global_jobject");
    bind(valid_global_tag);
  }
#endif

  // Resolve global handle: subtract the tag to recover the handle address.
  access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp);
  verify_oop(value);

  bind(done);
}
3588
// Pointer-width subtract of an immediate (64-bit subq on this platform).
void MacroAssembler::subptr(Register dst, int32_t imm32) {
  subq(dst, imm32);
}
3592
// Force generation of a 4 byte immediate value even if it fits into 8bit
// (needed when the instruction will later be patched in place).
void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
  subq_imm32(dst, imm32);
}
3597
// Pointer-width register-register subtract (64-bit subq on this platform).
void MacroAssembler::subptr(Register dst, Register src) {
  subq(dst, src);
}
3601
// C++ bool manipulation
// Sets flags according to whether the C++ bool value in dst is true, using
// a test sized to match this toolchain's sizeof(bool). Only 1- and 4-byte
// bools are implemented; other sizes trap at code-generation time.
void MacroAssembler::testbool(Register dst) {
  if(sizeof(bool) == 1)
    testb(dst, 0xff);
  else if(sizeof(bool) == 2) {
    // testw implementation needed for two byte bools
    ShouldNotReachHere();
  } else if(sizeof(bool) == 4)
    testl(dst, dst);
  else
    // unsupported
    ShouldNotReachHere();
}
3615
// Pointer-width register-register test (64-bit testq on this platform).
void MacroAssembler::testptr(Register dst, Register src) {
  testq(dst, src);
}
3619
// Object / value buffer allocation...
//
// Kills klass and rsi on LP64
//
// Allocates an instance of 'klass' into new_obj (which must be rax, as
// required by tlab_allocate and the callers' register conventions).
// If clear_fields is true the field area is zeroed; the header is always
// initialized. On any failure, control transfers to alloc_failed and no
// object has been published.
void MacroAssembler::allocate_instance(Register klass, Register new_obj,
                                       Register t1, Register t2,
                                       bool clear_fields, Label& alloc_failed)
{
  Label done, initialize_header, initialize_object, slow_case, slow_case_no_pop;
  Register layout_size = t1;
  assert(new_obj == rax, "needs to be rax");
  assert_different_registers(klass, new_obj, t1, t2);

  // get instance_size in InstanceKlass (scaled to a count of bytes)
  movl(layout_size, Address(klass, Klass::layout_helper_offset()));
  // test to see if it is malformed in some way
  testl(layout_size, Klass::_lh_instance_slow_path_bit);
  jcc(Assembler::notZero, slow_case_no_pop);

  // Allocate the instance:
  //  If TLAB is enabled:
  //    Try to allocate in the TLAB.
  //    If fails, go to the slow path.
  //  Else If inline contiguous allocations are enabled:
  //    Try to allocate in eden.
  //    If fails due to heap end, go to slow path.
  //
  //  If TLAB is enabled OR inline contiguous is enabled:
  //    Initialize the allocation.
  //    Exit.
  //
  //  Go to slow path.

  push(klass);                // preserve klass; every exit path pops it back
  if (UseTLAB) {
    tlab_allocate(new_obj, layout_size, 0, klass, t2, slow_case);
    if (ZeroTLAB || (!clear_fields)) {
      // the fields have been already cleared
      jmp(initialize_header);
    } else {
      // initialize both the header and fields
      jmp(initialize_object);
    }
  } else {
    jmp(slow_case);
  }

  // If UseTLAB is true, the object is created above and there is an initialize need.
  // Otherwise, skip and go to the slow path.
  if (UseTLAB) {
    if (clear_fields) {
      // The object is initialized before the header. If the object size is
      // zero, go directly to the header initialization.
      bind(initialize_object);
      // layout_size becomes the size of the field area only (object size
      // minus the header).
      if (UseCompactObjectHeaders) {
        assert(is_aligned(oopDesc::base_offset_in_bytes(), BytesPerLong), "oop base offset must be 8-byte-aligned");
        decrement(layout_size, oopDesc::base_offset_in_bytes());
      } else {
        decrement(layout_size, sizeof(oopDesc));
      }
      jcc(Assembler::zero, initialize_header);

      // Initialize topmost object field, divide size by 8, check if odd and
      // test if zero.
      Register zero = klass;
      xorl(zero, zero); // use zero reg to clear memory (shorter code)
      shrl(layout_size, LogBytesPerLong); // divide by 2*oopSize and set carry flag if odd

#ifdef ASSERT
      // make sure instance_size was multiple of 8
      Label L;
      // Ignore partial flag stall after shrl() since it is debug VM
      jcc(Assembler::carryClear, L);
      stop("object size is not multiple of 2 - adjust this code");
      bind(L);
      // must be > 0, no extra check needed here
#endif

      // initialize remaining object fields: instance_size was a multiple of 8
      {
        Label loop;
        bind(loop);
        int header_size_bytes = oopDesc::header_size() * HeapWordSize;
        assert(is_aligned(header_size_bytes, BytesPerLong), "oop header size must be 8-byte-aligned");
        // Store backwards from the top of the field area down to the header.
        movptr(Address(new_obj, layout_size, Address::times_8, header_size_bytes - 1*oopSize), zero);
        decrement(layout_size);
        jcc(Assembler::notZero, loop);
      }
    } // clear_fields

    // initialize object header only.
    bind(initialize_header);
    if (UseCompactObjectHeaders || Arguments::is_valhalla_enabled()) {
      pop(klass);
      Register mark_word = t2;
      // Use the klass-specific prototype mark word stored in the Klass.
      movptr(mark_word, Address(klass, Klass::prototype_header_offset()));
      movptr(Address(new_obj, oopDesc::mark_offset_in_bytes ()), mark_word);
    } else {
      movptr(Address(new_obj, oopDesc::mark_offset_in_bytes()),
             (intptr_t)markWord::prototype().value()); // header
      pop(klass);   // get saved klass back in the register.
    }
    if (!UseCompactObjectHeaders) {
      xorl(rsi, rsi); // use zero reg to clear memory (shorter code)
      store_klass_gap(new_obj, rsi);  // zero klass gap for compressed oops
      movptr(t2, klass);  // preserve klass
      store_klass(new_obj, t2, rscratch1);  // src klass reg is potentially compressed
    }
    jmp(done);
  }

  bind(slow_case);
  pop(klass);                 // discard the saved klass before bailing out
  bind(slow_case_no_pop);
  jmp(alloc_failed);

  bind(done);
}
3737
// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// Thin forwarder: the actual TLAB bump-pointer allocation is delegated to the
// active GC's BarrierSetAssembler. Jumps to slow_case when the TLAB cannot
// satisfy the request.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}
3748
// Returns the set of general-purpose registers treated as call-clobbered
// (caller-saved) on this platform. rsi/rdi are callee-saved under the
// Windows x64 ABI and therefore excluded there; the APX extended registers
// (r16 and up) are included when UseAPX is enabled.
RegSet MacroAssembler::call_clobbered_gp_registers() {
  RegSet regs;
  regs += RegSet::of(rax, rcx, rdx);
#ifndef _WINDOWS
  regs += RegSet::of(rsi, rdi);
#endif
  regs += RegSet::range(r8, r11);
  if (UseAPX) {
    regs += RegSet::range(r16, as_Register(Register::number_of_registers - 1));
  }
  return regs;
}
3761
// Returns the set of XMM registers treated as call-clobbered. Under the
// Windows x64 ABI only xmm0-xmm5 are volatile (xmm6-xmm15 are callee-saved),
// while the EVEX registers xmm16+ are volatile everywhere; elsewhere all
// available XMM registers are caller-saved.
XMMRegSet MacroAssembler::call_clobbered_xmm_registers() {
  int num_xmm_registers = XMMRegister::available_xmm_registers();
#if defined(_WINDOWS)
  XMMRegSet result = XMMRegSet::range(xmm0, xmm5);
  if (num_xmm_registers > 16) {
    result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1));
  }
  return result;
#else
  return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1));
#endif
}
3774
// C1 only ever uses the first double/float of the XMM register.
// Hence a single 8-byte slot per XMM register suffices when spilling.
static int xmm_save_size() { return sizeof(double); }
3777
// Spill the low double of 'reg' to [rsp + offset].
static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
  masm->movdbl(Address(rsp, offset), reg);
}
3781
// Reload the low double of 'reg' from [rsp + offset].
static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
  masm->movdbl(reg, Address(rsp, offset));
}
3785
// Computes, via the out-parameters, the byte sizes of the GP and XMM spill
// sections used by push/pop_call_clobbered_registers_except, and returns
// their sum. The GP section is padded up to the stack alignment so the XMM
// section that follows starts aligned.
static int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers,
                                  bool save_fpu, int& gp_area_size, int& xmm_area_size) {

  gp_area_size = align_up(gp_registers.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size,
                          StackAlignmentInBytes);
  // No XMM section at all if FPU state is not being saved.
  xmm_area_size = save_fpu ? xmm_registers.size() * xmm_save_size() : 0;

  return gp_area_size + xmm_area_size;
}
3795
// Spills all call-clobbered GP registers (minus 'exclude') and, if save_fpu,
// all call-clobbered XMM registers into one stack-allocated save area.
// Layout: [rsp + 0 .. gp_area_size) GP slots, then the XMM slots.
// Must be paired with pop_call_clobbered_registers_except using the same
// 'exclude' set and fpu flag.
void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) {
  block_comment("push_call_clobbered_registers start");
  // Regular registers
  RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude;

  int gp_area_size;
  int xmm_area_size;
  int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu,
                                               gp_area_size, xmm_area_size);
  // Reserve the whole save area in one adjustment.
  subptr(rsp, total_save_size);

  push_set(gp_registers_to_push, 0);

  if (save_fpu) {
    // XMM slots live above the (aligned) GP section.
    push_set(call_clobbered_xmm_registers(), gp_area_size);
  }

  block_comment("push_call_clobbered_registers end");
}
3815
// Restores the registers saved by push_call_clobbered_registers_except.
// Must be called with the same 'exclude' set and fpu flag so the computed
// save-area layout matches.
void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) {
  block_comment("pop_call_clobbered_registers start");

  RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude;

  int gp_area_size;
  int xmm_area_size;
  int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu,
                                               gp_area_size, xmm_area_size);

  if (restore_fpu) {
    pop_set(call_clobbered_xmm_registers(), gp_area_size);
  }

  pop_set(gp_registers_to_pop, 0);

  // Release the whole save area reserved by the matching push.
  addptr(rsp, total_save_size);

  // Clear upper AVX state to avoid AVX-SSE transition penalties after the call.
  vzeroupper();

  block_comment("pop_call_clobbered_registers end");
}
3838
3839 void MacroAssembler::push_set(XMMRegSet set, int offset) {
3840 assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be");
3841 int spill_offset = offset;
3842
3843 for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) {
3844 save_xmm_register(this, spill_offset, *it);
3845 spill_offset += xmm_save_size();
3846 }
3847 }
3848
3849 void MacroAssembler::pop_set(XMMRegSet set, int offset) {
3850 int restore_size = set.size() * xmm_save_size();
3851 assert(is_aligned(restore_size, StackAlignmentInBytes), "must be");
3852
3853 int restore_offset = offset + restore_size - xmm_save_size();
3854
3855 for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) {
3856 restore_xmm_register(this, restore_offset, *it);
3857 restore_offset -= xmm_save_size();
3858 }
3859 }
3860
3861 void MacroAssembler::push_set(RegSet set, int offset) {
3862 int spill_offset;
3863 if (offset == -1) {
3864 int register_push_size = set.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3865 int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
3866 subptr(rsp, aligned_size);
3867 spill_offset = 0;
3868 } else {
3869 spill_offset = offset;
3870 }
3871
3872 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
3873 movptr(Address(rsp, spill_offset), *it);
3874 spill_offset += Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3875 }
3876 }
3877
3878 void MacroAssembler::pop_set(RegSet set, int offset) {
3879
3880 int gp_reg_size = Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3881 int restore_size = set.size() * gp_reg_size;
3882 int aligned_size = align_up(restore_size, StackAlignmentInBytes);
3883
3884 int restore_offset;
3885 if (offset == -1) {
3886 restore_offset = restore_size - gp_reg_size;
3887 } else {
3888 restore_offset = offset + restore_size - gp_reg_size;
3889 }
3890 for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) {
3891 movptr(*it, Address(rsp, restore_offset));
3892 restore_offset -= gp_reg_size;
3893 }
3894
3895 if (offset == -1) {
3896 addptr(rsp, aligned_size);
3897 }
3898 }
3899
// Preserves the contents of address, destroys the contents length_in_bytes and temp.
// Zeroes length_in_bytes bytes starting at [address + offset_in_bytes].
// length_in_bytes must be a word-multiple (asserted in debug builds);
// a zero length is handled as a no-op.
void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
  assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
  assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
  Label done;

  testptr(length_in_bytes, length_in_bytes);
  jcc(Assembler::zero, done);

  // initialize topmost word, divide index by 2, check if odd and test if zero
  // note: for the remaining code to work, index must be a multiple of BytesPerWord
#ifdef ASSERT
  {
    Label L;
    testptr(length_in_bytes, BytesPerWord - 1);
    jcc(Assembler::zero, L);
    stop("length must be a multiple of BytesPerWord");
    bind(L);
  }
#endif
  Register index = length_in_bytes;
  xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
  if (UseIncDec) {
    shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
  } else {
    // Two single-bit shifts avoid the partial flag stall of a 3-bit shift.
    shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
    shrptr(index, 1);
  }

  // initialize remaining object fields: index is a multiple of 2 now
  // (stores walk backwards from the top of the region to the start)
  {
    Label loop;
    bind(loop);
    movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
    decrement(index);
    jcc(Assembler::notZero, loop);
  }

  bind(done);
}
3940
3941 void MacroAssembler::inline_layout_info(Register holder_klass, Register index, Register layout_info) {
3942 movptr(layout_info, Address(holder_klass, InstanceKlass::inline_layout_info_array_offset()));
3943 #ifdef ASSERT
3944 {
3945 Label done;
3946 cmpptr(layout_info, 0);
3947 jcc(Assembler::notEqual, done);
3948 stop("inline_layout_info_array is null");
3949 bind(done);
3950 }
3951 #endif
3952
3953 InlineLayoutInfo array[2];
3954 int size = (char*)&array[1] - (char*)&array[0]; // computing size of array elements
3955 if (is_power_of_2(size)) {
3956 shll(index, log2i_exact(size)); // Scale index by power of 2
3957 } else {
3958 imull(index, index, size); // Scale the index to be the entry index * array_element_size
3959 }
3960 lea(layout_info, Address(layout_info, index, Address::times_1, Array<InlineLayoutInfo>::base_offset_in_bytes()));
3961 }
3962
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
//
// Kills scan_temp, and recv_klass when return_method is true.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");

  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = in_bytes(itableMethodEntry::method_offset());
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // Could store the aligned, prescaled offset in the klass.
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  }

  // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The loop below is emitted twice (peel == 1, then 0): the first itable
  // entry is checked before falling into the scan loop proper.
  for (int peel = 1; peel >= 0; peel--) {
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset()));
    cmpptr(intf_klass, method_result);

    if (peel) {
      jccb(Assembler::equal, found_method);
    } else {
      jccb(Assembler::notEqual, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    testptr(method_result, method_result);
    jcc(Assembler::zero, L_no_such_interface);
    addptr(scan_temp, scan_step);
  }

  bind(found_method);

  if (return_method) {
    // Got a hit.
    movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset()));
    movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
  }
}
4040
// Look up the method for a megamorphic invokeinterface call in a single pass over itable:
// - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
// - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
// The target method is determined by <holder_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
//
// Kills scan_temp and temp_reg2 (or recv_klass when temp_reg2 == noreg).
void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
                                                  Register holder_klass,
                                                  Register resolved_klass,
                                                  Register method_result,
                                                  Register scan_temp,
                                                  Register temp_reg2,
                                                  Register receiver,
                                                  int itable_index,
                                                  Label& L_no_such_interface) {
  assert_different_registers(recv_klass, method_result, holder_klass, resolved_klass, scan_temp, temp_reg2, receiver);
  Register temp_itbl_klass = method_result;
  Register temp_reg = (temp_reg2 == noreg ? recv_klass : temp_reg2); // reuse recv_klass register on 32-bit x86 impl

  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = in_bytes(itableMethodEntry::method_offset());
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  int ioffset = in_bytes(itableOffsetEntry::interface_offset());
  int ooffset = in_bytes(itableOffsetEntry::offset_offset());
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "adjust times_vte_scale");

  Label L_loop_scan_resolved_entry, L_resolved_found, L_holder_found;

  // temp_itbl_klass = recv_klass.itable[0]
  // scan_temp = &recv_klass.itable[0] + step
  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
  movptr(temp_itbl_klass, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset));
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset + scan_step));
  xorptr(temp_reg, temp_reg); // temp_reg will hold the found holder vtable offset; 0 = not found yet

  // Initial checks:
  //   - if (holder_klass != resolved_klass), go to "scan for resolved"
  //   - if (itable[0] == 0), no such interface
  //   - if (itable[0] == holder_klass), shortcut to "holder found"
  cmpptr(holder_klass, resolved_klass);
  jccb(Assembler::notEqual, L_loop_scan_resolved_entry);
  testptr(temp_itbl_klass, temp_itbl_klass);
  jccb(Assembler::zero, L_no_such_interface);
  cmpptr(holder_klass, temp_itbl_klass);
  jccb(Assembler::equal, L_holder_found);

  // Loop: Look for holder_klass record in itable
  //   do {
  //     tmp = itable[index];
  //     index += step;
  //     if (tmp == holder_klass) {
  //       goto L_holder_found; // Found!
  //     }
  //   } while (tmp != 0);
  //   goto L_no_such_interface // Not found.
  Label L_scan_holder;
  bind(L_scan_holder);
  movptr(temp_itbl_klass, Address(scan_temp, 0));
  addptr(scan_temp, scan_step);
  cmpptr(holder_klass, temp_itbl_klass);
  jccb(Assembler::equal, L_holder_found);
  testptr(temp_itbl_klass, temp_itbl_klass);
  jccb(Assembler::notZero, L_scan_holder);

  jmpb(L_no_such_interface);

  // Loop: Look for resolved_class record in itable
  //   do {
  //     tmp = itable[index];
  //     index += step;
  //     if (tmp == holder_klass) {
  //        // Also check if we have met a holder klass
  //        holder_tmp = itable[index-step-ioffset];
  //     }
  //     if (tmp == resolved_klass) {
  //        goto L_resolved_found;  // Found!
  //     }
  //   } while (tmp != 0);
  //   goto L_no_such_interface // Not found.
  //
  Label L_loop_scan_resolved;
  bind(L_loop_scan_resolved);
  movptr(temp_itbl_klass, Address(scan_temp, 0));
  addptr(scan_temp, scan_step);
  bind(L_loop_scan_resolved_entry);
  cmpptr(holder_klass, temp_itbl_klass);
  // Conditionally remember the holder's vtable offset while scanning for
  // resolved_klass, so a second scan can often be avoided.
  cmovl(Assembler::equal, temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
  cmpptr(resolved_klass, temp_itbl_klass);
  jccb(Assembler::equal, L_resolved_found);
  testptr(temp_itbl_klass, temp_itbl_klass);
  jccb(Assembler::notZero, L_loop_scan_resolved);

  jmpb(L_no_such_interface);

  Label L_ready;

  // See if we already have a holder klass. If not, go and scan for it.
  bind(L_resolved_found);
  testptr(temp_reg, temp_reg);
  jccb(Assembler::zero, L_scan_holder);
  jmpb(L_ready);

  bind(L_holder_found);
  movl(temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));

  // Finally, temp_reg contains holder_klass vtable offset
  bind(L_ready);
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  if (temp_reg2 == noreg) { // recv_klass register is clobbered for 32-bit x86 impl
    // recv_klass was reused as temp_reg above; re-derive the klass from the receiver.
    load_klass(scan_temp, receiver, noreg);
    movptr(method_result, Address(scan_temp, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
  } else {
    movptr(method_result, Address(recv_klass, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
  }
}
4159
4160
// virtual method calling
// Loads into method_result the Method* at vtable_index of recv_klass'
// vtable. vtable_index may be a constant or a register.
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const ByteSize base = Klass::vtable_start_offset();
  assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
  Address vtable_entry_addr(recv_klass,
                            vtable_index, Address::times_ptr,
                            base + vtableEntry::method_offset());
  movptr(method_result, vtable_entry_addr);
}
4172
4173
// Full subtype check: jumps to L_success if sub_klass is a subtype of
// super_klass (trying the fast path first, then the slow path), otherwise
// falls through. Kills temp_reg.
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, nullptr);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, nullptr);
  bind(L_failure);
}
4183
4184
// Fast path of a subtype check: compares the two klasses for identity, then
// consults sub_klass' supertype display at super_klass' super_check_offset.
// At most one of L_success / L_failure / L_slow_path may be null, meaning
// "fall through" for that outcome. If super_check_offset is not supplied,
// it is loaded into temp_reg (which then must not be noreg).
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb. If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label)                                \
  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
  else                             jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            jmp(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface. Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    local_jcc(Assembler::equal, *L_success);
    // Distinguish "primary display miss" (hard failure) from "looked in the
    // secondary super cache" (must take the slow path).
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}
4286
4287
// Slow path of a subtype check: linear scan of sub_klass' secondary supers
// array for super_klass using repne_scan. On a hit the result is cached in
// sub_klass' secondary_super_cache. Either L_success or L_failure (but not
// both) may be null, meaning "fall through" for that outcome.
// repne_scan requires rax/rcx/rdi, which are spilled around the scan unless
// they are supplied as temps.
void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
                                                          Register super_klass,
                                                          Register temp_reg,
                                                          Register temp2_reg,
                                                          Label* L_success,
                                                          Label* L_failure,
                                                          bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)

  // Get super_klass value into rax (even if it was in rdi or rcx).
  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  if (super_klass != rax) {
    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
    mov(rax, super_klass);
  }
  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }

#ifndef PRODUCT
  // Count how often the slow linear scan is taken.
  uint* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  lea(rcx, pst_counter_addr);
  incrementl(Address(rcx, 0));
#endif //PRODUCT

  // We will consult the secondary-super array.
  movptr(rdi, secondary_supers_addr);
  // Load the array length. (Positive movl does right thing on LP64.)
  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  addptr(rdi, Array<Klass*>::base_offset_in_bytes());

  // Scan RCX words at [RDI] for an occurrence of RAX.
  // Set NZ/Z based on last compare.
  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
  // not change flags (only scas instruction which is repeated sets flags).
  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.

  testptr(rax,rax); // Set Z = 0
  repne_scan();

  // Unspill the temp. registers:
  if (pushed_rdi)  pop(rdi);
  if (pushed_rcx)  pop(rcx);
  if (pushed_rax)  pop(rax);

  if (set_cond_codes) {
    // Special hack for the AD files:  rdi is guaranteed non-zero.
    assert(!pushed_rdi, "rdi must be left non-null");
    // Also, the condition codes are properly set Z/NZ on succeed/failure.
  }

  if (L_failure == &L_fallthrough)
        jccb(Assembler::notEqual, *L_failure);
  else  jcc(Assembler::notEqual, *L_failure);

  // Success. Cache the super we found and proceed in triumph.
  movptr(super_cache_addr, super_klass);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
4378
// Convenience overload of the subtype-check slow path with only two temps.
// set_cond_codes is a 32-bit-only contract and must be false here.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert(set_cond_codes == false, "must be false on 64-bit x86");
  check_klass_subtype_slow_path
    (sub_klass, super_klass, temp_reg, temp2_reg, noreg, noreg,
     L_success, L_failure);
}
4391
// Slow path of a subtype check: dispatches to the hashed secondary-supers
// table lookup when UseSecondarySupersTable is enabled, otherwise to the
// linear repne_scan-based search.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Register temp3_reg,
                                                   Register temp4_reg,
                                                   Label* L_success,
                                                   Label* L_failure) {
  if (UseSecondarySupersTable) {
    check_klass_subtype_slow_path_table
      (sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, temp4_reg,
       L_success, L_failure);
  } else {
    check_klass_subtype_slow_path_linear
      (sub_klass, super_klass, temp_reg, temp2_reg, L_success, L_failure, /*set_cond_codes*/false);
  }
}
4409
4410 Register MacroAssembler::allocate_if_noreg(Register r,
4411 RegSetIterator<Register> &available_regs,
4412 RegSet ®s_to_push) {
4413 if (!r->is_valid()) {
4414 r = *available_regs++;
4415 regs_to_push += r;
4416 }
4417 return r;
4418 }
4419
// Slow path of a subtype check using the hashed secondary-supers table.
// Any temp supplied as noreg is allocated from a pool of caller-saved
// registers and spilled around the lookup. Either L_success or L_failure
// (but not both) may be null, meaning "fall through" for that outcome.
void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
                                                         Register super_klass,
                                                         Register temp_reg,
                                                         Register temp2_reg,
                                                         Register temp3_reg,
                                                         Register result_reg,
                                                         Label* L_success,
                                                         Label* L_failure) {
  // NB! Callers may assume that, when temp2_reg is a valid register,
  // this code sets it to a nonzero value.
  bool temp2_reg_was_valid = temp2_reg->is_valid();

  RegSet temps = RegSet::of(temp_reg, temp2_reg, temp3_reg);

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  BLOCK_COMMENT("check_klass_subtype_slow_path_table");

  // Pool of scratch registers for any temps the caller did not provide.
  RegSetIterator<Register> available_regs
    = (RegSet::of(rax, rcx, rdx, r8) + r9 + r10 + r11 + r12 - temps - sub_klass - super_klass).begin();

  RegSet pushed_regs;

  temp_reg = allocate_if_noreg(temp_reg, available_regs, pushed_regs);
  temp2_reg = allocate_if_noreg(temp2_reg, available_regs, pushed_regs);
  temp3_reg = allocate_if_noreg(temp3_reg, available_regs, pushed_regs);
  result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
  Register temp4_reg = allocate_if_noreg(noreg, available_regs, pushed_regs);

  assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, result_reg);

  {

    int register_push_size = pushed_regs.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
    int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
    subptr(rsp, aligned_size);
    push_set(pushed_regs, 0);

    lookup_secondary_supers_table_var(sub_klass,
                                      super_klass,
                                      temp_reg, temp2_reg, temp3_reg, temp4_reg, result_reg);
    // result_reg == 0 means "is a subtype"; the flags from this compare must
    // survive until the jcc below.
    cmpq(result_reg, 0);

    // Unspill the temp. registers:
    pop_set(pushed_regs, 0);
    // Increment SP but do not clobber flags.
    lea(rsp, Address(rsp, aligned_size));
  }

  if (temp2_reg_was_valid) {
    // Honor the nonzero-temp2 contract noted above; movq does not affect flags.
    movq(temp2_reg, 1);
  }

  jcc(Assembler::notEqual, *L_failure);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

  bind(L_fallthrough);
}
4485
4486 // population_count variant for running without the POPCNT
4487 // instruction, which was introduced with SSE4.2 in 2008.
// Compute dst = number of set bits in src. Uses the POPCNT instruction
// when available, otherwise a Kernighan bit-clearing loop. Both scratch
// registers are clobbered (and poisoned in debug builds).
void MacroAssembler::population_count(Register dst, Register src,
                                      Register scratch1, Register scratch2) {
  assert_different_registers(src, scratch1, scratch2);
  if (UsePopCountInstruction) {
    // Single-instruction hardware popcount.
    Assembler::popcntq(dst, src);
  } else {
    // Kernighan's algorithm: clear the lowest set bit once per iteration,
    // counting iterations. Runs in O(popcount(src)) steps.
    assert_different_registers(src, scratch1, scratch2);
    assert_different_registers(dst, scratch1, scratch2);
    Label loop, done;

    mov(scratch1, src);
    // dst = 0;
    // while(scratch1 != 0) {
    //   dst++;
    //   scratch1 &= (scratch1 - 1);
    // }
    xorl(dst, dst);
    testq(scratch1, scratch1);
    jccb(Assembler::equal, done);    // src == 0 => dst == 0
    {
      bind(loop);
      incq(dst);
      movq(scratch2, scratch1);
      decq(scratch2);
      andq(scratch1, scratch2);      // clears lowest set bit; ZF set when none remain
      jccb(Assembler::notEqual, loop);
    }
    bind(done);
  }
#ifdef ASSERT
  // Poison the scratch registers in debug builds so callers cannot
  // accidentally rely on their contents.
  mov64(scratch1, 0xCafeBabeDeadBeef);
  movq(scratch2, scratch1);
#endif
}
4522
// Ensure that the inline code and the stub are using the same registers.
// This fixed register assignment is the de-facto calling convention between
// the inline secondary-supers lookup and the shared slow-path stub; the
// inline side invokes this macro (see lookup_secondary_supers_table_const)
// and presumably the stub generator does the same — the names below are
// expected to be bound as local Register constants at each use site.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \
do {                                            \
  assert(r_super_klass == rax, "mismatch");     \
  assert(r_array_base == rbx, "mismatch");      \
  assert(r_array_length == rcx, "mismatch");    \
  assert(r_array_index == rdx, "mismatch");     \
  assert(r_sub_klass == rsi || r_sub_klass == noreg, "mismatch"); \
  assert(r_bitmap == r11 || r_bitmap == noreg, "mismatch");       \
  assert(result == rdi || result == noreg, "mismatch");           \
} while(0)
4534
4535 // Versions of salq and rorq that don't need count to be in rcx
4536
4537 void MacroAssembler::salq(Register dest, Register count) {
4538 if (count == rcx) {
4539 Assembler::salq(dest);
4540 } else {
4541 assert_different_registers(rcx, dest);
4542 xchgq(rcx, count);
4543 Assembler::salq(dest);
4544 xchgq(rcx, count);
4545 }
4546 }
4547
4548 void MacroAssembler::rorq(Register dest, Register count) {
4549 if (count == rcx) {
4550 Assembler::rorq(dest);
4551 } else {
4552 assert_different_registers(rcx, dest);
4553 xchgq(rcx, count);
4554 Assembler::rorq(dest);
4555 xchgq(rcx, count);
4556 }
4557 }
4558
4559 // Return true: we succeeded in generating this code
4560 //
4561 // At runtime, return 0 in result if r_super_klass is a superclass of
4562 // r_sub_klass, otherwise return nonzero. Use this if you know the
4563 // super_klass_slot of the class you're looking for. This is always
4564 // the case for instanceof and checkcast.
void MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
                                                         Register r_super_klass,
                                                         Register temp1,
                                                         Register temp2,
                                                         Register temp3,
                                                         Register temp4,
                                                         Register result,
                                                         u1 super_klass_slot) {
  assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);

  Label L_fallthrough, L_success, L_failure;

  BLOCK_COMMENT("lookup_secondary_supers_table {");

  const Register
    r_array_index = temp1,
    r_array_length = temp2,
    r_array_base = temp3,
    r_bitmap = temp4;

  // Check that we honor the register convention shared with the slow-path stub.
  LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;

  xorq(result, result); // = 0

  movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
  movq(r_array_index, r_bitmap);

  // First check the bitmap to see if super_klass might be present. If
  // the bit is zero, we are certain that super_klass is not one of
  // the secondary supers.
  u1 bit = super_klass_slot;
  {
    // NB: If the count in a x86 shift instruction is 0, the flags are
    // not affected, so we do a testq instead.
    int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
    if (shift_count != 0) {
      // Move the bitmap bit for `bit` into the sign position.
      salq(r_array_index, shift_count);
    } else {
      // bit is already the topmost slot, so its bit is already the MSB.
      testq(r_array_index, r_array_index);
    }
  }
  // We test the MSB of r_array_index, i.e. its sign bit
  jcc(Assembler::positive, L_failure);

  // Get the first array index that can contain super_klass into r_array_index.
  if (bit != 0) {
    // After the shift above, popcount counts exactly the bitmap bits at or
    // below `bit`, i.e. the 1-based position of this slot's entry in the
    // packed secondary-supers array.
    population_count(r_array_index, r_array_index, temp2, temp3);
  } else {
    // For bit == 0 only bit 0 survives the shift (and we know it is set),
    // so the popcount is exactly 1.
    movl(r_array_index, 1);
  }
  // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.

  // We will consult the secondary-super array.
  movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));

  // We're asserting that the first word in an Array<Klass*> is the
  // length, and the second word is the first word of the data. If
  // that ever changes, r_array_base will have to be adjusted here.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
  assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");

  cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
  jccb(Assembler::equal, L_success);

  // Is there another entry to check? Consult the bitmap.
  btq(r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
  jccb(Assembler::carryClear, L_failure);

  // Linear probe. Rotate the bitmap so that the next bit to test is
  // in Bit 1.
  if (bit != 0) {
    rorq(r_bitmap, bit);
  }

  // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
  // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
  // Kills: r_array_length.
  // Returns: result.
  call(RuntimeAddress(StubRoutines::lookup_secondary_supers_table_slow_path_stub()));
  // Result (0/1) is in rdi
  jmpb(L_fallthrough);

  bind(L_failure);
  incq(result); // 0 => 1

  bind(L_success);
  // result = 0;

  bind(L_fallthrough);
  BLOCK_COMMENT("} lookup_secondary_supers_table");

  if (VerifySecondarySupers) {
    // Debug cross-check: compare this result against a full linear scan.
    verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
                                  temp1, temp2, temp3);
  }
}
4661
4662 // At runtime, return 0 in result if r_super_klass is a superclass of
4663 // r_sub_klass, otherwise return nonzero. Use this version of
4664 // lookup_secondary_supers_table() if you don't know ahead of time
4665 // which superclass will be searched for. Used by interpreter and
4666 // runtime stubs. It is larger and has somewhat greater latency than
4667 // the version above, which takes a constant super_klass_slot.
void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
                                                       Register r_super_klass,
                                                       Register temp1,
                                                       Register temp2,
                                                       Register temp3,
                                                       Register temp4,
                                                       Register result) {
  assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
  assert_different_registers(r_sub_klass, r_super_klass, rcx);
  RegSet temps = RegSet::of(temp1, temp2, temp3, temp4);

  Label L_fallthrough, L_success, L_failure;

  BLOCK_COMMENT("lookup_secondary_supers_table {");

  // Assign roles to the caller's temps as we go, drawing from this set.
  RegSetIterator<Register> available_regs = (temps - rcx).begin();

  // FIXME. Once we are sure that all paths reaching this point really
  // do pass rcx as one of our temps we can get rid of the following
  // workaround.
  assert(temps.contains(rcx), "fix this code");

  // We prefer to have our shift count in rcx. If rcx is one of our
  // temps, use it for slot. If not, pick any of our temps.
  Register slot;
  if (!temps.contains(rcx)) {
    slot = *available_regs++;
  } else {
    slot = rcx;
  }

  const Register r_array_index = *available_regs++;
  const Register r_bitmap = *available_regs++;

  // The logic above guarantees this property, but we state it here.
  assert_different_registers(r_array_index, r_bitmap, rcx);

  movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
  movq(r_array_index, r_bitmap);

  // First check the bitmap to see if super_klass might be present. If
  // the bit is zero, we are certain that super_klass is not one of
  // the secondary supers.
  movb(slot, Address(r_super_klass, Klass::hash_slot_offset()));
  xorl(slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1)); // slot ^ 63 === 63 - slot (mod 64)
  salq(r_array_index, slot);

  // The shift count may be zero, in which case the shift would not have
  // set the flags, so test explicitly.
  testq(r_array_index, r_array_index);
  // We test the MSB of r_array_index, i.e. its sign bit
  jcc(Assembler::positive, L_failure);

  const Register r_array_base = *available_regs++;

  // Get the first array index that can contain super_klass into r_array_index.
  // Note: Clobbers r_array_base and slot.
  population_count(r_array_index, r_array_index, /*temp2*/r_array_base, /*temp3*/slot);

  // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.

  // We will consult the secondary-super array.
  movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));

  // We're asserting that the first word in an Array<Klass*> is the
  // length, and the second word is the first word of the data. If
  // that ever changes, r_array_base will have to be adjusted here.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
  assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");

  cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
  jccb(Assembler::equal, L_success);

  // Restore slot to its true value (population_count clobbered it above).
  movb(slot, Address(r_super_klass, Klass::hash_slot_offset()));

  // Linear probe. Rotate the bitmap so that the next bit to test is
  // in Bit 1.
  rorq(r_bitmap, slot);

  // Is there another entry to check? Consult the bitmap.
  btq(r_bitmap, 1);
  jccb(Assembler::carryClear, L_failure);

  // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
  // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
  // Kills: r_array_length.
  // Returns: result.
  lookup_secondary_supers_table_slow_path(r_super_klass,
                                          r_array_base,
                                          r_array_index,
                                          r_bitmap,
                                          /*temp1*/result,
                                          /*temp2*/slot,
                                          &L_success,
                                          nullptr);

  bind(L_failure);
  movq(result, 1);
  jmpb(L_fallthrough);

  bind(L_success);
  xorq(result, result); // = 0

  bind(L_fallthrough);
  BLOCK_COMMENT("} lookup_secondary_supers_table");

  if (VerifySecondarySupers) {
    // Debug cross-check: compare this result against a full linear scan.
    verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
                                  temp1, temp2, temp3);
  }
}
4778
// Scan the 8-byte slots addr[count..limit) for `value`. Branches to
// L_success when a match is found (with `count` holding the matching slot
// index, since the jump is taken before the increment); branches to
// L_failure (or falls through, when L_failure is null) after `limit` slots
// without a match. Note: despite the name, this emits an explicit
// compare/branch loop, not a REPNE-prefixed scan.
void MacroAssembler::repne_scanq(Register addr, Register value, Register count, Register limit,
                                 Label* L_success, Label* L_failure) {
  Label L_loop, L_fallthrough;
  {
    // A null success/failure label means "fall through"; at most one may be null.
    int label_nulls = 0;
    if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
    if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
    assert(label_nulls <= 1, "at most one null in the batch");
  }
  bind(L_loop);
  cmpq(value, Address(addr, count, Address::times_8));
  jcc(Assembler::equal, *L_success);
  addl(count, 1);
  cmpl(count, limit);
  jcc(Assembler::less, L_loop);

  if (&L_fallthrough != L_failure) {
    jmp(*L_failure);
  }
  bind(L_fallthrough);
}
4800
4801 // Called by code generated by check_klass_subtype_slow_path
4802 // above. This is called when there is a collision in the hashed
4803 // lookup in the secondary supers array.
void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
                                                             Register r_array_base,
                                                             Register r_array_index,
                                                             Register r_bitmap,
                                                             Register temp1,
                                                             Register temp2,
                                                             Label* L_success,
                                                             Label* L_failure) {
  assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, temp2);

  const Register
    r_array_length = temp1,
    r_sub_klass = noreg,   // not used on this path
    result = noreg;        // not used on this path

  // A null success/failure label means "fall through"; at most one may be null.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  // Load the array length.
  movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
  // And adjust the array base to point to the data.
  // NB! Effectively increments current slot index by 1.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
  addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());

  // Linear probe
  Label L_huge;

  // The bitmap is full to bursting.
  // Implicit invariant: BITMAP_FULL implies (length > 0)
  cmpl(r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
  jcc(Assembler::greater, L_huge);

  // NB! Our caller has checked bits 0 and 1 in the bitmap. The
  // current slot (at secondary_supers[r_array_index]) has not yet
  // been inspected, and r_array_index may be out of bounds if we
  // wrapped around the end of the array.

  { // This is conventional linear probing, but instead of terminating
    // when a null entry is found in the table, we maintain a bitmap
    // in which a 0 indicates missing entries.
    // The check above guarantees there are 0s in the bitmap, so the loop
    // eventually terminates.

    xorl(temp2, temp2); // = 0;

    Label L_again;
    bind(L_again);

    // Check for array wraparound.
    cmpl(r_array_index, r_array_length);
    cmovl(Assembler::greaterEqual, r_array_index, temp2);  // wrap: index = 0

    cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
    jcc(Assembler::equal, *L_success);

    // If the next bit in bitmap is zero, we're done.
    btq(r_bitmap, 2); // look-ahead check (Bit 2); Bits 0 and 1 are tested by now
    jcc(Assembler::carryClear, *L_failure);

    rorq(r_bitmap, 1); // Bits 1/2 => 0/1
    addl(r_array_index, 1);

    jmp(L_again);
  }

  { // Degenerate case: more than 64 secondary supers.
    // FIXME: We could do something smarter here, maybe a vectorized
    // comparison or a binary search, but is that worth any added
    // complexity?
    bind(L_huge);
    // Fall back to a plain linear scan over the whole array.
    xorl(r_array_index, r_array_index); // = 0
    repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length,
                L_success,
                (&L_fallthrough != L_failure ? L_failure : nullptr));

    bind(L_fallthrough);
  }
}
4886
// Argument record built on the stack by verify_secondary_supers_table().
// Field order must match the push sequence there (pushed in reverse order:
// result, linear_result, sub, super — so `_super` is at the lowest address).
struct VerifyHelperArguments {
  Klass* _super;            // super klass being looked for
  Klass* _sub;              // sub klass whose secondary supers were searched
  intptr_t _linear_result;  // result of the linear scan (0 = present)
  intptr_t _table_result;   // result of the hashed table lookup (0 = present)
};
4893
4894 static void verify_secondary_supers_table_helper(const char* msg, VerifyHelperArguments* args) {
4895 Klass::on_secondary_supers_verification_failure(args->_super,
4896 args->_sub,
4897 args->_linear_result,
4898 args->_table_result,
4899 msg);
4900 }
4901
4902 // Make sure that the hashed lookup and a linear scan agree.
// Make sure that the hashed lookup and a linear scan agree.
// `result` holds the hashed lookup's answer (0 = subtype). This emits a
// full linear scan of the secondary supers array and calls a noreturn
// helper if the two answers differ. Clobbers temp1-temp3.
void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
                                                   Register r_super_klass,
                                                   Register result,
                                                   Register temp1,
                                                   Register temp2,
                                                   Register temp3) {
  const Register
    r_array_index = temp1,
    r_array_length = temp2,
    r_array_base = temp3,
    r_bitmap = noreg;

  BLOCK_COMMENT("verify_secondary_supers_table {");

  Label L_success, L_failure, L_check, L_done;

  movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
  movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
  // And adjust the array base to point to the data.
  addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());

  testl(r_array_length, r_array_length); // array_length == 0?
  jcc(Assembler::zero, L_failure);

  movl(r_array_index, 0);
  repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length, &L_success);
  // fall through to L_failure

  const Register linear_result = r_array_index; // reuse temp1

  bind(L_failure); // not present
  movl(linear_result, 1);
  jmp(L_check);

  bind(L_success); // present
  movl(linear_result, 0);

  bind(L_check);
  // Compare the linear scan's verdict with the hashed lookup's.
  cmpl(linear_result, result);
  jcc(Assembler::equal, L_done);

  { // To avoid calling convention issues, build a record on the stack
    // and pass the pointer to that instead.
    // NB: push order is the reverse of the VerifyHelperArguments layout.
    push(result);
    push(linear_result);
    push(r_sub_klass);
    push(r_super_klass);
    movptr(c_rarg1, rsp);
    movptr(c_rarg0, (uintptr_t) "mismatch");
    call(RuntimeAddress(CAST_FROM_FN_PTR(address, verify_secondary_supers_table_helper)));
    should_not_reach_here();
  }
  bind(L_done);

  BLOCK_COMMENT("} verify_secondary_supers_table");
}
4959
4960 #undef LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS
4961
// Class-initialization barrier: branch to L_fast_path when `klass` is fully
// initialized or is being initialized by the current thread, otherwise to
// L_slow_path. Exactly one of the two labels may be null, meaning that path
// falls through (both labels non-null is not implemented, see below).
void MacroAssembler::clinit_barrier(Register klass, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");

  Label L_fallthrough;
  if (L_fast_path == nullptr) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == nullptr) {
    L_slow_path = &L_fallthrough;
  }

  // Fast path check: class is fully initialized.
  // init_state needs acquire, but x86 is TSO, and so we are already good.
  cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
  jcc(Assembler::equal, *L_fast_path);

  // Fast path check: current thread is initializer thread
  cmpptr(r15_thread, Address(klass, InstanceKlass::init_thread_offset()));
  if (L_slow_path == &L_fallthrough) {
    // Slow path falls through: jump to fast path on match, else fall off the end.
    jcc(Assembler::equal, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    // Fast path falls through: jump to slow path on mismatch.
    jcc(Assembler::notEqual, *L_slow_path);
    bind(*L_fast_path);
  } else {
    Unimplemented();
  }
}
4989
4990 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4991 if (VM_Version::supports_cmov()) {
4992 cmovl(cc, dst, src);
4993 } else {
4994 Label L;
4995 jccb(negate_condition(cc), L);
4996 movl(dst, src);
4997 bind(L);
4998 }
4999 }
5000
5001 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
5002 if (VM_Version::supports_cmov()) {
5003 cmovl(cc, dst, src);
5004 } else {
5005 Label L;
5006 jccb(negate_condition(cc), L);
5007 movl(dst, src);
5008 bind(L);
5009 }
5010 }
5011
// Emit a debug check that `reg` holds a valid oop, by calling the shared
// verify_oop subroutine with the oop and a descriptive message string.
// No-op unless -XX:+VerifyOops.
void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
  if (!VerifyOops || VerifyAdapterSharing) {
    // Below address of the code string confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
    return;
  }

  BLOCK_COMMENT("verify_oop {");
  push(rscratch1);
  push(rax); // save rax
  push(reg); // pass register argument

  // Pass register number to verify_oop_subroutine
  const char* b = nullptr;
  {
    // Build the message in the code-string area so it outlives this call.
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
    b = code_string(ss.as_string());
  }
  AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
  pushptr(buffer.addr(), rscratch1);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (oop, message) and restores rax, r10
  BLOCK_COMMENT("} verify_oop");
}
5041
5042 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
5043 if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
5044 // Only pcmpeq has dependency breaking treatment (i.e the execution can begin without
5045 // waiting for the previous result on dst), not vpcmpeqd, so just use vpternlog
5046 vpternlogd(dst, 0xFF, dst, dst, vector_len);
5047 } else if (VM_Version::supports_avx()) {
5048 vpcmpeqd(dst, dst, dst, vector_len);
5049 } else {
5050 pcmpeqd(dst, dst);
5051 }
5052 }
5053
5054 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
5055 int extra_slot_offset) {
5056 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
5057 int stackElementSize = Interpreter::stackElementSize;
5058 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
5059 #ifdef ASSERT
5060 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
5061 assert(offset1 - offset == stackElementSize, "correct arithmetic");
5062 #endif
5063 Register scale_reg = noreg;
5064 Address::ScaleFactor scale_factor = Address::no_scale;
5065 if (arg_slot.is_constant()) {
5066 offset += arg_slot.as_constant() * stackElementSize;
5067 } else {
5068 scale_reg = arg_slot.as_register();
5069 scale_factor = Address::times(stackElementSize);
5070 }
5071 offset += wordSize; // return PC is on stack
5072 return Address(rsp, scale_reg, scale_factor, offset);
5073 }
5074
5075 // Handle the receiver type profile update given the "recv" klass.
5076 //
5077 // Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
5078 // If there are no matching or claimable receiver entries in RD, updates
5079 // the polymorphic counter.
5080 //
5081 // This code expected to run by either the interpreter or JIT-ed code, without
5082 // extra synchronization. For safety, receiver cells are claimed atomically, which
5083 // avoids grossly misrepresenting the profiles under concurrent updates. For speed,
5084 // counter updates are not atomic.
5085 //
void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset) {
  // Precompute the word offsets of the receiver rows and counters in the MDO.
  int base_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(0));
  int end_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit()));
  int poly_count_offset = in_bytes(CounterData::count_offset());
  int receiver_step = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
  int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;

  // Adjust for MDP offsets. Slots are pointer-sized, so is the global offset.
  assert(is_aligned(mdp_offset, BytesPerWord), "sanity");
  base_receiver_offset += mdp_offset;
  end_receiver_offset += mdp_offset;
  poly_count_offset += mdp_offset;

  // Scale down to optimize encoding. Slots are pointer-sized.
  assert(is_aligned(base_receiver_offset, BytesPerWord), "sanity");
  assert(is_aligned(end_receiver_offset, BytesPerWord), "sanity");
  assert(is_aligned(poly_count_offset, BytesPerWord), "sanity");
  assert(is_aligned(receiver_step, BytesPerWord), "sanity");
  assert(is_aligned(receiver_to_count_step, BytesPerWord), "sanity");
  base_receiver_offset >>= LogBytesPerWord;
  end_receiver_offset >>= LogBytesPerWord;
  poly_count_offset >>= LogBytesPerWord;
  receiver_step >>= LogBytesPerWord;
  receiver_to_count_step >>= LogBytesPerWord;

#ifdef ASSERT
  // We are about to walk the MDO slots without asking for offsets.
  // Check that our math hits all the right spots.
  for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
    int real_recv_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
    int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
    int offset = base_receiver_offset + receiver_step*c;
    int count_offset = offset + receiver_to_count_step;
    assert((offset << LogBytesPerWord) == real_recv_offset, "receiver slot math");
    assert((count_offset << LogBytesPerWord) == real_count_offset, "receiver count math");
  }
  int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
  assert(poly_count_offset << LogBytesPerWord == real_poly_count_offset, "poly counter math");
#endif

  // Corner case: no profile table. Increment poly counter and exit.
  if (ReceiverTypeData::row_limit() == 0) {
    addptr(Address(mdp, poly_count_offset, Address::times_ptr), DataLayout::counter_increment);
    return;
  }

  // Word-scaled offset of the slot currently being inspected.
  Register offset = rscratch1;

  Label L_loop_search_receiver, L_loop_search_empty;
  Label L_restart, L_found_recv, L_found_empty, L_polymorphic, L_count_update;

  // The code here recognizes three major cases:
  //   A. Fastest: receiver found in the table
  //   B. Fast: no receiver in the table, and the table is full
  //   C. Slow: no receiver in the table, free slots in the table
  //
  // The case A performance is most important, as perfectly-behaved code would end up
  // there, especially with larger TypeProfileWidth. The case B performance is
  // important as well, this is where bulk of code would land for normally megamorphic
  // cases. The case C performance is not essential, its job is to deal with installation
  // races, we optimize for code density instead. Case C needs to make sure that receiver
  // rows are only claimed once. This makes sure we never overwrite a row for another
  // receiver and never duplicate the receivers in the list, making profile type-accurate.
  //
  // It is very tempting to handle these cases in a single loop, and claim the first slot
  // without checking the rest of the table. But, profiling code should tolerate free slots
  // in the table, as class unloading can clear them. After such cleanup, the receiver
  // we need might be _after_ the free slot. Therefore, we need to let at least full scan
  // to complete, before trying to install new slots. Splitting the code in several tight
  // loops also helpfully optimizes for cases A and B.
  //
  // This code is effectively:
  //
  //  restart:
  //    // Fastest: receiver is already installed
  //    for (i = 0; i < receiver_count(); i++) {
  //      if (receiver(i) == recv) goto found_recv(i);
  //    }
  //
  //    // Fast: no receiver, but profile is full
  //    for (i = 0; i < receiver_count(); i++) {
  //      if (receiver(i) == null) goto found_null(i);
  //    }
  //    goto polymorphic
  //
  //    // Slow: try to install receiver
  //    found_null(i):
  //      CAS(&receiver(i), null, recv);
  //      goto restart
  //
  //    polymorphic:
  //      count++;
  //      return
  //
  //    found_recv(i):
  //      *receiver_count(i)++
  //

  bind(L_restart);

  // Fastest: receiver is already installed
  movptr(offset, base_receiver_offset);
  bind(L_loop_search_receiver);
  cmpptr(recv, Address(mdp, offset, Address::times_ptr));
  jccb(Assembler::equal, L_found_recv);
  addptr(offset, receiver_step);
  cmpptr(offset, end_receiver_offset);
  jccb(Assembler::notEqual, L_loop_search_receiver);

  // Fast: no receiver, but profile is full
  movptr(offset, base_receiver_offset);
  bind(L_loop_search_empty);
  cmpptr(Address(mdp, offset, Address::times_ptr), NULL_WORD);
  jccb(Assembler::equal, L_found_empty);
  addptr(offset, receiver_step);
  cmpptr(offset, end_receiver_offset);
  jccb(Assembler::notEqual, L_loop_search_empty);
  jmpb(L_polymorphic);

  // Slow: try to install receiver
  bind(L_found_empty);

  // Atomically swing receiver slot: null -> recv.
  //
  // The update code uses CAS, which wants RAX register specifically, *and* it needs
  // other important registers untouched, as they form the address. Therefore, we need
  // to shift any important registers from RAX into some other spare register. If we
  // have a spare register, we are forced to save it on stack here.

  Register spare_reg = noreg;
  Register shifted_mdp = mdp;
  Register shifted_recv = recv;
  if (recv == rax || mdp == rax) {
    // Pick a spare register that collides with neither recv nor mdp.
    spare_reg = (recv != rbx && mdp != rbx) ? rbx :
                (recv != rcx && mdp != rcx) ? rcx :
                                              rdx;
    assert_different_registers(mdp, recv, offset, spare_reg);

    push(spare_reg);
    if (recv == rax) {
      movptr(spare_reg, recv);
      shifted_recv = spare_reg;
    } else {
      assert(mdp == rax, "Remaining case");
      movptr(spare_reg, mdp);
      shifted_mdp = spare_reg;
    }
  } else {
    // RAX itself is free to clobber; just preserve its old value.
    push(rax);
  }

  // None of the important registers are in RAX after this shuffle.
  assert_different_registers(rax, shifted_mdp, shifted_recv, offset);

  // CAS: compare RAX (null) against the slot, install shifted_recv on match.
  xorptr(rax, rax);
  cmpxchgptr(shifted_recv, Address(shifted_mdp, offset, Address::times_ptr));

  // Unshift registers.
  if (recv == rax || mdp == rax) {
    movptr(rax, spare_reg);
    pop(spare_reg);
  } else {
    pop(rax);
  }

  // CAS success means the slot now has the receiver we want. CAS failure means
  // something had claimed the slot concurrently: it can be the same receiver we want,
  // or something else. Since this is a slow path, we can optimize for code density,
  // and just restart the search from the beginning.
  jmpb(L_restart);

  // Counter updates:

  // Increment polymorphic counter instead of receiver slot.
  bind(L_polymorphic);
  movptr(offset, poly_count_offset);
  jmpb(L_count_update);

  // Found a receiver, convert its slot offset to corresponding count offset.
  bind(L_found_recv);
  addptr(offset, receiver_to_count_step);

  // Note: counter bump is deliberately non-atomic (see header comment).
  bind(L_count_update);
  addptr(Address(mdp, offset, Address::times_ptr), DataLayout::counter_increment);
}
5271
// Emit a debug check that the oop at memory location `addr` is valid, by
// calling the shared verify_oop subroutine. No-op unless -XX:+VerifyOops.
void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
  if (!VerifyOops || VerifyAdapterSharing) {
    // Below address of the code string confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
    return;
  }

  push(rscratch1);
  push(rax); // save rax,
  // addr may contain rsp so we will have to adjust it based on the push
  // we just did (and on 64 bit we do two pushes)
  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
  // stores rax into addr which is backwards of what was intended.
  if (addr.uses(rsp)) {
    lea(rax, addr);
    pushptr(Address(rax, 2 * BytesPerWord));
  } else {
    pushptr(addr);
  }

  // Pass register number to verify_oop_subroutine
  const char* b = nullptr;
  {
    // Build the message in the code-string area so it outlives this call.
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
    b = code_string(ss.as_string());
  }
  AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
  pushptr(buffer.addr(), rscratch1);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (addr, message) and restores rax, r10.
}
5308
// Debug-only sanity check of the current thread's TLAB pointers:
// stops the VM unless tlab_start <= tlab_top <= tlab_end.
// Emits code only in ASSERT builds with both UseTLAB and VerifyOops enabled.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;
    Register t1 = rsi; // scratch; saved/restored around the checks

    push(t1);

    // Check: top >= start
    movptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
    cmpptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_start_offset())));
    jcc(Assembler::aboveEqual, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    // Check: end >= top
    bind(next);
    movptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_end_offset())));
    cmpptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
    jcc(Assembler::aboveEqual, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    pop(t1);
  }
#endif
}
5335
5336 class ControlWord {
5337 public:
5338 int32_t _value;
5339
5340 int rounding_control() const { return (_value >> 10) & 3 ; }
5341 int precision_control() const { return (_value >> 8) & 3 ; }
5342 bool precision() const { return ((_value >> 5) & 1) != 0; }
5343 bool underflow() const { return ((_value >> 4) & 1) != 0; }
5344 bool overflow() const { return ((_value >> 3) & 1) != 0; }
5345 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
5346 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
5347 bool invalid() const { return ((_value >> 0) & 1) != 0; }
5348
5349 void print() const {
5350 // rounding control
5351 const char* rc;
5352 switch (rounding_control()) {
5353 case 0: rc = "round near"; break;
5354 case 1: rc = "round down"; break;
5355 case 2: rc = "round up "; break;
5356 case 3: rc = "chop "; break;
5357 default:
5358 rc = nullptr; // silence compiler warnings
5359 fatal("Unknown rounding control: %d", rounding_control());
5360 };
5361 // precision control
5362 const char* pc;
5363 switch (precision_control()) {
5364 case 0: pc = "24 bits "; break;
5365 case 1: pc = "reserved"; break;
5366 case 2: pc = "53 bits "; break;
5367 case 3: pc = "64 bits "; break;
5368 default:
5369 pc = nullptr; // silence compiler warnings
5370 fatal("Unknown precision control: %d", precision_control());
5371 };
5372 // flags
5373 char f[9];
5374 f[0] = ' ';
5375 f[1] = ' ';
5376 f[2] = (precision ()) ? 'P' : 'p';
5377 f[3] = (underflow ()) ? 'U' : 'u';
5378 f[4] = (overflow ()) ? 'O' : 'o';
5379 f[5] = (zero_divide ()) ? 'Z' : 'z';
5380 f[6] = (denormalized()) ? 'D' : 'd';
5381 f[7] = (invalid ()) ? 'I' : 'i';
5382 f[8] = '\x0';
5383 // output
5384 printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
5385 }
5386
5387 };
5388
// x87 FPU status word image (low 16 bits significant): busy flag,
// condition codes C0-C3, top-of-stack index, and exception status bits.
class StatusWord {
 public:
  int32_t _value;

  bool busy()         const { return (_value & (1 << 15)) != 0; }
  bool C3()           const { return (_value & (1 << 14)) != 0; }
  bool C2()           const { return (_value & (1 << 10)) != 0; }
  bool C1()           const { return (_value & (1 <<  9)) != 0; }
  bool C0()           const { return (_value & (1 <<  8)) != 0; }
  int  top()          const { return (_value >> 11) & 7; }
  bool error_status() const { return (_value & (1 << 7)) != 0; }
  bool stack_fault()  const { return (_value & (1 << 6)) != 0; }
  bool precision()    const { return (_value & (1 << 5)) != 0; }
  bool underflow()    const { return (_value & (1 << 4)) != 0; }
  bool overflow()     const { return (_value & (1 << 3)) != 0; }
  bool zero_divide()  const { return (_value & (1 << 2)) != 0; }
  bool denormalized() const { return (_value & (1 << 1)) != 0; }
  bool invalid()      const { return (_value & (1 << 0)) != 0; }

  // Dump as "<hex> flags = <flags>, cc = <codes>, top = <n>";
  // a letter/digit marks a set bit, '-' marks a clear one.
  void print() const {
    // condition codes C3..C0
    const char cc[5] = {
      C3() ? '3' : '-',
      C2() ? '2' : '-',
      C1() ? '1' : '-',
      C0() ? '0' : '-',
      '\0'
    };
    // exception/status flags
    const char flags[9] = {
      error_status() ? 'E' : '-',
      stack_fault()  ? 'S' : '-',
      precision()    ? 'P' : '-',
      underflow()    ? 'U' : '-',
      overflow()     ? 'O' : '-',
      zero_divide()  ? 'Z' : '-',
      denormalized() ? 'D' : '-',
      invalid()      ? 'I' : '-',
      '\0'
    };
    printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, flags, cc, top());
  }
};
5432
// x87 FPU tag word image: two tag bits per physical register slot
// (see FPU_State::tag_as_string for the meaning of each value).
class TagWord {
 public:
  int32_t _value;

  // Tag bits for physical register slot i.
  int tag_at(int i) const { return (_value >> (2 * i)) & 0x3; }

  // Dump the 16 significant bits as hex.
  void print() const { printf("%04x", _value & 0xFFFF); }
};
5444
// Image of one 80-bit x87 register: 64-bit mantissa split into two
// 32-bit halves plus the 16-bit sign/exponent word.
class FPU_Register {
 public:
  int32_t _m0; // mantissa, low half
  int32_t _m1; // mantissa, high half
  int16_t _ex; // sign + exponent

  // True for the x87 "indefinite" QNaN bit pattern.
  bool is_indefinite() const {
    if (_ex != -1) {
      return false;
    }
    return _m1 == (int32_t)0xC0000000 && _m0 == 0;
  }

  // Dump as "<sign><exp>.<mantissa> [NaN]".
  void print() const {
    const char sign_char = (_ex < 0) ? '-' : '+';
    const bool looks_nan = (_ex == 0x7FFF || _ex == (int16_t)-1);
    printf("%c%04hx.%08x%08x %s", sign_char, _ex, _m1, _m0, looks_nan ? "NaN" : " ");
  };
};
5462
// Snapshot of the full x87 FPU state: control/status/tag words, the
// error/data pointers, and the eight 80-bit register images. Field order
// presumably matches the layout produced by push_CPU_state — TODO confirm.
class FPU_State {
 public:
  enum {
    register_size = 10,   // bytes per 80-bit register image
    number_of_registers = 8,
    register_mask = 7
  };

  ControlWord _control_word;
  StatusWord _status_word;
  TagWord _tag_word;
  int32_t _error_offset;
  int32_t _error_selector;
  int32_t _data_offset;
  int32_t _data_selector;
  int8_t _register[register_size * number_of_registers];

  // Tag of stack-relative register ST(i): physical slot = (top + i) mod 8.
  int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
  // Raw register image for stack-relative index i.
  FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; }

  // Human-readable name for a 2-bit tag value.
  const char* tag_as_string(int tag) const {
    switch (tag) {
      case 0: return "valid";
      case 1: return "zero";
      case 2: return "special";
      case 3: return "empty";
    }
    ShouldNotReachHere();
    return nullptr;
  }

  // Dump all eight registers (marking the top-of-stack with '*') followed
  // by the control, status and tag words.
  void print() const {
    // print computation registers
    { int t = _status_word.top();
      for (int i = 0; i < number_of_registers; i++) {
        // j is the stack-relative index of physical register i
        int j = (i - t) & register_mask;
        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
        st(j)->print();
        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
      }
    }
    printf("\n");
    // print control registers
    printf("ctrl = "); _control_word.print(); printf("\n");
    printf("stat = "); _status_word .print(); printf("\n");
    printf("tags = "); _tag_word    .print(); printf("\n");
  }
};
5512
// Image of the integer-unit flags register (EFLAGS) with accessors for
// the commonly inspected condition bits.
class Flag_Register {
 public:
  int32_t _value;

  bool overflow()        const { return (_value & (1 << 11)) != 0; }
  bool direction()       const { return (_value & (1 << 10)) != 0; }
  bool sign()            const { return (_value & (1 <<  7)) != 0; }
  bool zero()            const { return (_value & (1 <<  6)) != 0; }
  bool auxiliary_carry() const { return (_value & (1 <<  4)) != 0; }
  bool parity()          const { return (_value & (1 <<  2)) != 0; }
  bool carry()           const { return (_value & (1 <<  0)) != 0; }

  // Dump as "<hex> flags = <flags>"; a letter marks a set bit, '-' a clear one.
  void print() const {
    const char flags[8] = {
      overflow()        ? 'O' : '-',
      direction()       ? 'D' : '-',
      sign()            ? 'S' : '-',
      zero()            ? 'Z' : '-',
      auxiliary_carry() ? 'A' : '-',
      parity()          ? 'P' : '-',
      carry()           ? 'C' : '-',
      '\0'
    };
    printf("%08x flags = %s", _value, flags);
  }
};
5541
// Image of one 32-bit integer register, printed as hex and signed decimal.
class IU_Register {
 public:
  int32_t _value;

  void print() const {
    const int32_t v = _value;
    printf("%08x %11d", v, v);
  }
};
5551
// Snapshot of the integer-unit registers. Field order presumably mirrors the
// stack layout produced by push_CPU_state — TODO confirm against that code.
class IU_State {
 public:
  Flag_Register _eflags;
  IU_Register _rdi;
  IU_Register _rsi;
  IU_Register _rbp;
  IU_Register _rsp;
  IU_Register _rbx;
  IU_Register _rdx;
  IU_Register _rcx;
  IU_Register _rax;

  // Dump one labelled line per register, then the flags.
  void print() const {
    // computation registers
    printf("rax, = "); _rax.print(); printf("\n");
    printf("rbx, = "); _rbx.print(); printf("\n");
    printf("rcx = "); _rcx.print(); printf("\n");
    printf("rdx = "); _rdx.print(); printf("\n");
    printf("rdi = "); _rdi.print(); printf("\n");
    printf("rsi = "); _rsi.print(); printf("\n");
    printf("rbp, = "); _rbp.print(); printf("\n");
    printf("rsp = "); _rsp.print(); printf("\n");
    printf("\n");
    // control registers
    printf("flgs = "); _eflags.print(); printf("\n");
  }
};
5579
5580
// Combined FPU + integer-unit state snapshot as dumped by print_CPU_state.
class CPU_State {
 public:
  FPU_State _fpu_state;
  IU_State _iu_state;

  // Dump integer registers first, then the FPU state, framed by separators.
  void print() const {
    printf("--------------------------------------------------\n");
    _iu_state .print();
    printf("\n");
    _fpu_state.print();
    printf("--------------------------------------------------\n");
  }
};
5595
5596
// Runtime entry invoked from generated code (see print_CPU_state below):
// dumps the CPU state snapshot that the generated code pushed on the stack.
static void _print_CPU_state(CPU_State* state) {
  state->print();
};
5600
5601
// Emit code that saves the full CPU state, calls _print_CPU_state with a
// pointer to the saved state (the current rsp), and then restores the state.
void MacroAssembler::print_CPU_state() {
  push_CPU_state();
  push(rsp); // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize); // discard argument
  pop_CPU_state();
}
5609
// Re-establish the VM's expected CPU control state after a JNI call:
// restore (or verify, with -Xcheck:jni) MXCSR, and clear the upper YMM bits.
void MacroAssembler::restore_cpu_control_state_after_jni(Register rscratch) {
  // Either restore the MXCSR register after returning from the JNI Call
  // or verify that it wasn't changed (with -Xcheck:jni flag).
  if (VM_Version::supports_sse()) {
    if (RestoreMXCSROnJNICalls) {
      ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), rscratch);
    } else if (CheckJNICalls) {
      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
    }
  }
  // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
  vzeroupper();
}
5623
5624 // ((OopHandle)result).resolve();
5625 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5626 assert_different_registers(result, tmp);
5627
5628 // Only 64 bit platforms support GCs that require a tmp register
5629 // Only IN_HEAP loads require a thread_tmp register
5630 // OopHandle::resolve is an indirection like jobject.
5631 access_load_at(T_OBJECT, IN_NATIVE,
5632 result, Address(result, 0), tmp);
5633 }
5634
5635 // ((WeakHandle)result).resolve();
5636 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
5637 assert_different_registers(rresult, rtmp);
5638 Label resolved;
5639
5640 // A null weak handle resolves to null.
5641 cmpptr(rresult, 0);
5642 jcc(Assembler::equal, resolved);
5643
5644 // Only 64 bit platforms support GCs that require a tmp register
5645 // Only IN_HEAP loads require a thread_tmp register
5646 // WeakHandle::resolve is an indirection like jweak.
5647 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
5648 rresult, Address(rresult, 0), rtmp);
5649 bind(resolved);
5650 }
5651
// Load the java.lang.Class mirror of 'method''s holder class into 'mirror':
// method -> holder klass -> mirror OopHandle -> resolved mirror oop.
void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
  // get mirror
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  load_method_holder(mirror, method);
  movptr(mirror, Address(mirror, mirror_offset));
  resolve_oop_handle(mirror, tmp); // the mirror field is an OopHandle
}
5659
// Load the ClassLoaderData of 'rmethod''s holder class into 'rresult'.
void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
  load_method_holder(rresult, rmethod);
  movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
}
5664
// Load the InstanceKlass holding 'method' into 'holder' by chasing
// Method* -> ConstMethod* -> ConstantPool* -> pool holder.
void MacroAssembler::load_method_holder(Register holder, Register method) {
  movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
  movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
  movptr(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
}
5670
// Load the narrow klass word of object 'src' into 'dst', honoring the
// compact-object-headers layout (klass bits live in the mark word there).
void MacroAssembler::load_metadata(Register dst, Register src) {
  if (UseCompactObjectHeaders) {
    load_narrow_klass_compact(dst, src);
  } else {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}
5678
// Extract the narrow klass id from the mark word of 'src' (compact object
// headers only): load the mark word and shift the klass bits down.
void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
  assert(UseCompactObjectHeaders, "expect compact object headers");
  movq(dst, Address(src, oopDesc::mark_offset_in_bytes()));
  shrq(dst, markWord::klass_shift);
}
5684
// Load the Klass* of object 'src' into 'dst': read the narrow klass word
// (from the mark word or the klass field) and decode it to a full pointer.
// 'tmp' is scratch for decode_klass_not_null.
void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
  assert_different_registers(src, tmp);
  assert_different_registers(dst, tmp);

  if (UseCompactObjectHeaders) {
    load_narrow_klass_compact(dst, src);
    decode_klass_not_null(dst, tmp);
  } else {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst, tmp);
  }
}
5697
// Load the prototype mark word of 'src''s klass into 'dst'.
void MacroAssembler::load_prototype_header(Register dst, Register src, Register tmp) {
  load_klass(dst, src, tmp);
  movptr(dst, Address(dst, Klass::prototype_header_offset()));
}
5702
// Store the klass pointer 'src' into object 'dst''s klass field, encoding it
// to narrow form first. Note: this clobbers 'src' (encoded in place).
// Not usable with compact object headers (klass lives in the mark word there).
void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
  assert(!UseCompactObjectHeaders, "not with compact headers");
  assert_different_registers(src, tmp);
  assert_different_registers(dst, tmp);
  encode_klass_not_null(src, tmp);
  movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}
5710
// Compare the narrow klass id in 'klass' against the klass of object 'obj',
// setting the flags for a following jcc. 'tmp' is required only with
// compact object headers (to hold the extracted narrow klass).
void MacroAssembler::cmp_klass(Register klass, Register obj, Register tmp) {
  if (UseCompactObjectHeaders) {
    assert(tmp != noreg, "need tmp");
    assert_different_registers(klass, obj, tmp);
    load_narrow_klass_compact(tmp, obj);
    cmpl(klass, tmp);
  } else {
    cmpl(klass, Address(obj, oopDesc::klass_offset_in_bytes()));
  }
}
5721
// Compare the (narrow) klasses of two objects, setting the flags for a
// following jcc. 'tmp1' is always clobbered; 'tmp2' is needed only with
// compact object headers.
void MacroAssembler::cmp_klasses_from_objects(Register obj1, Register obj2, Register tmp1, Register tmp2) {
  if (UseCompactObjectHeaders) {
    assert(tmp2 != noreg, "need tmp2");
    assert_different_registers(obj1, obj2, tmp1, tmp2);
    load_narrow_klass_compact(tmp1, obj1);
    load_narrow_klass_compact(tmp2, obj2);
    cmpl(tmp1, tmp2);
  } else {
    movl(tmp1, Address(obj1, oopDesc::klass_offset_in_bytes()));
    cmpl(tmp1, Address(obj2, oopDesc::klass_offset_in_bytes()));
  }
}
5734
// GC-aware load of a value of 'type' from 'src' into 'dst'. Dispatches to the
// active BarrierSetAssembler unless AS_RAW is requested, in which case the
// base (barrier-free) implementation is called directly.
void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
                                    Register tmp1) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators, type);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    // Statically bound call: bypass any GC-specific barrier logic.
    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1);
  } else {
    bs->load_at(this, decorators, type, dst, src, tmp1);
  }
}
5746
// GC-aware store of 'val' (of 'type') to 'dst'. Mirrors access_load_at:
// AS_RAW selects the barrier-free base implementation; otherwise the active
// BarrierSetAssembler applies its pre/post barriers.
void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
                                     Register tmp1, Register tmp2, Register tmp3) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators, type);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    // Statically bound call: bypass any GC-specific barrier logic.
    bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
  } else {
    bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
  }
}
5758
// Copy a flat (inlined) field from 'src' to 'dst' via the active barrier set.
// 'inline_layout_info' carries the layout descriptor the barrier needs.
void MacroAssembler::flat_field_copy(DecoratorSet decorators, Register src, Register dst,
                                     Register inline_layout_info) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->flat_field_copy(this, decorators, src, dst, inline_layout_info);
}
5764
// Load the payload offset of the given InlineKlass into 'offset':
// chase the adr_members indirection, then read the 32-bit payload offset.
void MacroAssembler::payload_offset(Register inline_klass, Register offset) {
  movptr(offset, Address(inline_klass, InlineKlass::adr_members_offset()));
  movl(offset, Address(offset, InlineKlass::payload_offset_offset()));
}
5769
// Compute the address of the payload of inline-type instance 'oop' into
// 'data': oop + inline_klass->payload_offset(). When 'data' aliases 'oop',
// rscratch1 holds the offset and the add is done in place.
void MacroAssembler::payload_addr(Register oop, Register data, Register inline_klass) {
  // ((address) (void*) o) + vk->payload_offset();
  Register offset = (data == oop) ? rscratch1 : data;
  payload_offset(inline_klass, offset);
  if (data == oop) {
    addptr(data, offset);
  } else {
    lea(data, Address(oop, offset));
  }
}
5780
// Compute the address of element 'index' of a flat value array into 'data':
// base + (index << log2(element_size)), where the log2 element size is
// extracted from the array klass's layout helper. Clobbers rcx (shift count)
// and 'index' (shifted in place).
void MacroAssembler::data_for_value_array_index(Register array, Register array_klass,
                                                Register index, Register data) {
  assert(index != rcx, "index needs to shift by rcx");
  assert_different_registers(array, array_klass, index);
  assert_different_registers(rcx, array, index);

  // array->base() + (index << Klass::layout_helper_log2_element_size(lh));
  movl(rcx, Address(array_klass, Klass::layout_helper_offset()));

  // Klass::layout_helper_log2_element_size(lh)
  // (lh >> _lh_log2_element_size_shift) & _lh_log2_element_size_mask;
  shrl(rcx, Klass::_lh_log2_element_size_shift);
  andl(rcx, Klass::_lh_log2_element_size_mask);
  shlptr(index); // index << rcx

  lea(data, Address(array, index, Address::times_1, arrayOopDesc::base_offset_in_bytes(T_FLAT_ELEMENT)));
}
5798
// Load a (possibly null) heap oop from 'src' into 'dst' through the GC barrier.
void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1);
}
5802
// Doesn't do verification, generates fixed size code
// Load a heap oop known to be non-null (IS_NOT_NULL lets the barrier skip
// its null check) from 'src' into 'dst'.
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1);
}
5807
// Store heap oop 'val' to 'dst' through the GC barrier (pre/post as needed).
void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
                                    Register tmp2, Register tmp3, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
}
5812
// Used for storing nulls.
// (noreg as the value register signals a null store to the barrier.)
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
}
5817
// Store 'src' (typically zero) into the 32-bit klass-gap slot of object 'dst'.
// Compact object headers have no klass gap, hence the assert.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  assert(!UseCompactObjectHeaders, "Don't use with compact headers");
  // Store to klass gap in destination
  movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
}
5823
5824 #ifdef ASSERT
// Debug-only check that r12_heapbase still holds the compressed-oops base.
// Stops the VM with 'msg' on mismatch. Preserves rscratch1 if cmpptr would
// otherwise clobber it (unreachable base address case).
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    ExternalAddress src2(CompressedOops::base_addr());
    const bool is_src2_reachable = reachable(src2);
    if (!is_src2_reachable) {
      push(rscratch1); // cmpptr trashes rscratch1
    }
    cmpptr(r12_heapbase, src2, rscratch1);
    jcc(Assembler::equal, ok);
    STOP(msg);
    bind(ok);
    if (!is_src2_reachable) {
      pop(rscratch1);
    }
  }
}
5844 #endif
5845
// Algorithm must match oop.inline.hpp encode_heap_oop.
// Compress the (possibly null) oop in 'r' in place. With a non-null heap
// base, a null oop is first replaced by the base via cmov so the subtraction
// yields the canonical narrow null (0).
void MacroAssembler::encode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop");
  if (CompressedOops::base() == nullptr) {
    // Zero-based mode: only the alignment shift (if any) is needed.
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      shrq(r, LogMinObjAlignmentInBytes);
    }
    return;
  }
  // Map null -> heapbase so that (r - heapbase) encodes null as 0.
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}
5864
// Compress the oop in 'r' in place, assuming it is non-null (no null special
// case, so the code is shorter than encode_heap_oop). ASSERT builds verify
// the non-null assumption.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
  if (CompressedOops::base() != nullptr) {
    subq(r, r12_heapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}
5885
// Two-register variant: compress the non-null oop in 'src' into 'dst',
// leaving 'src' unchanged (unless dst == src).
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
  if (dst != src) {
    movq(dst, src);
  }
  if (CompressedOops::base() != nullptr) {
    subq(dst, r12_heapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}
5909
// Decompress the (possibly null) narrow oop in 'r' in place. With a non-null
// base, the base add is skipped for a null oop so null stays null.
void MacroAssembler::decode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (CompressedOops::base() == nullptr) {
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      shlq(r, LogMinObjAlignmentInBytes);
    }
  } else {
    Label done;
    // shlq leaves ZF set for a zero (null) narrow oop; skip the base add then.
    shlq(r, LogMinObjAlignmentInBytes);
    jccb(Assembler::equal, done);
    addq(r, r12_heapbase);
    bind(done);
  }
  verify_oop_msg(r, "broken oop in decode_heap_oop");
}
5928
// Decompress the narrow oop in 'r' in place, assuming it is non-null
// (no null check, so the base is added unconditionally).
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shlq(r, LogMinObjAlignmentInBytes);
    if (CompressedOops::base() != nullptr) {
      addq(r, r12_heapbase);
    }
  } else {
    assert (CompressedOops::base() == nullptr, "sanity");
  }
}
5946
// Two-register variant: decompress the non-null narrow oop in 'src' into
// 'dst'. Uses a single lea (base + src*8) when the shift is 3, otherwise
// falls back to move/shift/add.
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      // Shift-by-3 fits lea's scale factor: one instruction does it all.
      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    } else {
      if (dst != src) {
        movq(dst, src);
      }
      shlq(dst, LogMinObjAlignmentInBytes);
      if (CompressedOops::base() != nullptr) {
        addq(dst, r12_heapbase);
      }
    }
  } else {
    assert (CompressedOops::base() == nullptr, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
  }
}
5974
// Compress the non-null Klass* in 'r' in place: subtract the klass base
// (loaded indirectly when dumping an AOT code cache, so the cached code
// stays relocatable) and shift right. 'tmp' holds the base.
void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
  BLOCK_COMMENT("encode_klass_not_null {");
  assert_different_registers(r, tmp);
  if (CompressedKlassPointers::base() != nullptr) {
    if (AOTCodeCache::is_on_for_dump()) {
      movptr(tmp, ExternalAddress(CompressedKlassPointers::base_addr()));
    } else {
      movptr(tmp, (intptr_t)CompressedKlassPointers::base());
    }
    subq(r, tmp);
  }
  if (CompressedKlassPointers::shift() != 0) {
    shrq(r, CompressedKlassPointers::shift());
  }
  BLOCK_COMMENT("} encode_klass_not_null");
}
5991
// Compress the non-null Klass* in 'src' into 'dst' without a scratch
// register: 'dst' is first loaded with the negated base (or the base loaded
// and negated, for AOT relocatability), then src is added and shifted.
void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
  BLOCK_COMMENT("encode_and_move_klass_not_null {");
  assert_different_registers(src, dst);
  if (CompressedKlassPointers::base() != nullptr) {
    if (AOTCodeCache::is_on_for_dump()) {
      movptr(dst, ExternalAddress(CompressedKlassPointers::base_addr()));
      negq(dst);
    } else {
      movptr(dst, -(intptr_t)CompressedKlassPointers::base());
    }
    addq(dst, src); // dst = src - base
  } else {
    movptr(dst, src);
  }
  if (CompressedKlassPointers::shift() != 0) {
    shrq(dst, CompressedKlassPointers::shift());
  }
  BLOCK_COMMENT("} encode_and_move_klass_not_null");
}
6011
// Decompress the non-null narrow klass in 'r' in place: shift left, then add
// the klass base held in 'tmp' (loaded indirectly when dumping an AOT code
// cache so the cached code stays relocatable).
void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
  BLOCK_COMMENT("decode_klass_not_null {");
  assert_different_registers(r, tmp);
  // Note: it will change flags
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedKlassPointers::shift() != 0) {
    shlq(r, CompressedKlassPointers::shift());
  }
  if (CompressedKlassPointers::base() != nullptr) {
    if (AOTCodeCache::is_on_for_dump()) {
      movptr(tmp, ExternalAddress(CompressedKlassPointers::base_addr()));
    } else {
      movptr(tmp, (intptr_t)CompressedKlassPointers::base());
    }
    addq(r, tmp);
  }
  BLOCK_COMMENT("} decode_klass_not_null");
}
6032
// Decompress the non-null narrow klass in 'src' into 'dst' without a scratch
// register. Picks the cheapest sequence for the current base/shift
// configuration: plain move, lea with scale, or pre-shifted-base arithmetic.
void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
  BLOCK_COMMENT("decode_and_move_klass_not_null {");
  assert_different_registers(src, dst);
  // Note: it will change flags
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.

  if (CompressedKlassPointers::base() == nullptr &&
      CompressedKlassPointers::shift() == 0) {
    // The best case scenario is that there is no base or shift. Then it is already
    // a pointer that needs nothing but a register rename.
    movl(dst, src);
  } else {
    if (CompressedKlassPointers::shift() <= Address::times_8) {
      // Shift fits lea's scale factor: dst = base + (src << shift) in one lea.
      if (CompressedKlassPointers::base() != nullptr) {
        if (AOTCodeCache::is_on_for_dump()) {
          // Load the base indirectly so AOT-cached code stays relocatable.
          movptr(dst, ExternalAddress(CompressedKlassPointers::base_addr()));
        } else {
          movptr(dst, (intptr_t)CompressedKlassPointers::base());
        }
      } else {
        xorq(dst, dst);
      }
      if (CompressedKlassPointers::shift() != 0) {
        assert(CompressedKlassPointers::shift() == Address::times_8, "klass not aligned on 64bits?");
        leaq(dst, Address(dst, src, Address::times_8, 0));
      } else {
        addq(dst, src);
      }
    } else {
      // Shift too large for lea: compute ((base >> shift) + src) << shift,
      // which relies on the base being aligned to the shift.
      if (CompressedKlassPointers::base() != nullptr) {
        if (AOTCodeCache::is_on_for_dump()) {
          movptr(dst, ExternalAddress(CompressedKlassPointers::base_addr()));
          shrq(dst, CompressedKlassPointers::shift());
        } else {
          const intptr_t base_right_shifted =
              (intptr_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
          movptr(dst, base_right_shifted);
        }
      } else {
        xorq(dst, dst);
      }
      addq(dst, src);
      shlq(dst, CompressedKlassPointers::shift());
    }
  }
  BLOCK_COMMENT("} decode_and_move_klass_not_null");
}
6082
// Emit a move of the narrow form of 'obj' into register 'dst', recording an
// oop relocation so the embedded value is patched when the oop moves.
void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}
6091
// Memory-destination variant: store the narrow form of 'obj' to 'dst' with
// an oop relocation.
void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}
6100
// Emit a move of the encoded (narrow) form of Klass* 'k' into register 'dst',
// recording a metadata relocation for later patching.
void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
}
6107
// Memory-destination variant: store the encoded form of Klass* 'k' to 'dst'
// with a metadata relocation.
void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
}
6114
// Emit a compare of register 'dst' against the narrow form of 'obj',
// recording an oop relocation for the embedded immediate.
void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
}
6123
// Memory-operand variant: compare the word at 'dst' against the narrow form
// of 'obj', with an oop relocation for the embedded immediate.
void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
}
6132
// Emit a compare of register 'dst' against the encoded form of Klass* 'k',
// recording a metadata relocation for the embedded immediate.
void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
}
6139
// Memory-operand variant: compare the word at 'dst' against the encoded form
// of Klass* 'k', with a metadata relocation for the embedded immediate.
void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
}
6146
// Reload r12_heapbase with the compressed-oops base. When the heap is not yet
// initialized, or when dumping an AOT code cache (the base must not be baked
// in as an immediate), the base is loaded indirectly from its home location.
void MacroAssembler::reinit_heapbase() {
  if (UseCompressedOops) {
    if (Universe::heap() != nullptr && !AOTCodeCache::is_on_for_dump()) {
      if (CompressedOops::base() == nullptr) {
        MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
      } else {
        mov64(r12_heapbase, (int64_t)CompressedOops::base());
      }
    } else {
      movptr(r12_heapbase, ExternalAddress(CompressedOops::base_addr()));
    }
  }
}
6160
int MacroAssembler::store_inline_type_fields_to_buf(ciInlineKlass* vk, bool from_interpreter) {
  // NOTE(review): the assert message reads oddly for a check that the flag is
  // enabled — confirm the intended wording upstream.
  assert(InlineTypeReturnedAsFields, "Inline types should never be returned as fields");
  // An inline type might be returned. If fields are in registers we
  // need to allocate an inline type instance and initialize it with
  // the value of the fields.
  Label skip;
  // We only need a new buffered inline type if a new one is not returned
  // (bit 0 of rax set means the fields are in registers and must be buffered).
  testptr(rax, 1);
  jcc(Assembler::zero, skip);
  int call_offset = -1;

#ifdef _LP64
  // The following code is similar to allocate_instance but has some slight differences,
  // e.g. object size is always not zero, sometimes it's constant; storing klass ptr after
  // allocating is not necessary if vk != nullptr, etc. allocate_instance is not aware of these.
  Label slow_case;
  // 1. Try to allocate a new buffered inline instance either from TLAB or eden space
  mov(rscratch1, rax); // save rax for slow_case since *_allocate may corrupt it when allocation failed
  if (vk != nullptr) {
    // Called from C1, where the return type is statically known.
    movptr(rbx, (intptr_t)vk->get_InlineKlass());
    jint lh = vk->layout_helper();
    assert(lh != Klass::_lh_neutral_value, "inline class in return type must have been resolved");
    if (UseTLAB && !Klass::layout_helper_needs_slow_path(lh)) {
      tlab_allocate(rax, noreg, lh, r13, r14, slow_case);
    } else {
      jmp(slow_case);
    }
  } else {
    // Call from interpreter. RAX contains ((the InlineKlass* of the return type) | 0x01)
    mov(rbx, rax);
    andptr(rbx, -2); // clear the tag bit to recover the InlineKlass*
    if (UseTLAB) {
      // Read the layout helper and bail to the slow path if the klass
      // requires slow-path allocation.
      movl(r14, Address(rbx, Klass::layout_helper_offset()));
      testl(r14, Klass::_lh_instance_slow_path_bit);
      jcc(Assembler::notZero, slow_case);
      tlab_allocate(rax, r14, 0, r13, r14, slow_case);
    } else {
      jmp(slow_case);
    }
  }
  if (UseTLAB) {
    // 2. Initialize buffered inline instance header
    Register buffer_obj = rax;
    Register klass = rbx;
    if (UseCompactObjectHeaders) {
      // Compact headers: the klass is folded into the prototype mark word.
      Register mark_word = r13;
      movptr(mark_word, Address(klass, Klass::prototype_header_offset()));
      movptr(Address(buffer_obj, oopDesc::mark_offset_in_bytes()), mark_word);
    } else {
      movptr(Address(buffer_obj, oopDesc::mark_offset_in_bytes()), (intptr_t)markWord::inline_type_prototype().value());
      xorl(r13, r13);
      store_klass_gap(buffer_obj, r13);
      if (vk == nullptr) {
        // store_klass corrupts rbx(klass), so save it in r13 for later use (interpreter case only).
        mov(r13, klass);
      }
      store_klass(buffer_obj, klass, rscratch1);
      klass = r13;
    }
    // 3. Initialize its fields with an inline class specific handler
    if (vk != nullptr) {
      call(RuntimeAddress(vk->pack_handler())); // no need for call info as this will not safepoint.
    } else {
      // Interpreter case: fetch the pack handler from the klass at run time.
      movptr(rbx, Address(klass, InlineKlass::adr_members_offset()));
      movptr(rbx, Address(rbx, InlineKlass::pack_handler_offset()));
      call(rbx);
    }
    jmp(skip);
  }
  bind(slow_case);
  // We failed to allocate a new inline type, fall back to a runtime
  // call. Some oop field may be live in some registers but we can't
  // tell. That runtime call will take care of preserving them
  // across a GC if there's one.
  mov(rax, rscratch1); // restore the tagged value saved before allocation
#endif

  if (from_interpreter) {
    super_call_VM_leaf(StubRoutines::store_inline_type_fields_to_buf());
  } else {
    call(RuntimeAddress(StubRoutines::store_inline_type_fields_to_buf()));
    // Record the code offset just after the call; presumably used by the
    // compiled-code caller for debug/oopmap bookkeeping — TODO confirm.
    call_offset = offset();
  }

  bind(skip);
  // Returns the offset after the runtime call (compiled case) or -1.
  return call_offset;
}
6249
// Move a value between registers/stack slots and update the reg_state
// bookkeeping. Returns true once the destination holds the value (or already
// did), false if the destination is still read-only and the move must be
// retried later. Stack slots are addressed rsp-relative; the extra wordSize
// presumably skips the return-address slot — TODO confirm against callers.
bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]) {
  assert(from->is_valid() && to->is_valid(), "source and destination must be valid");
  if (reg_state[to->value()] == reg_written) {
    return true; // Already written
  }
  if (from != to && bt != T_VOID) {
    if (reg_state[to->value()] == reg_readonly) {
      return false; // Not yet writable
    }
    if (from->is_reg()) {
      if (to->is_reg()) {
        // Register -> register: XMM moves for float/double, GPR move otherwise.
        if (from->is_XMMRegister()) {
          if (bt == T_DOUBLE) {
            movdbl(to->as_XMMRegister(), from->as_XMMRegister());
          } else {
            assert(bt == T_FLOAT, "must be float");
            movflt(to->as_XMMRegister(), from->as_XMMRegister());
          }
        } else {
          movq(to->as_Register(), from->as_Register());
        }
      } else {
        // Register -> stack slot.
        int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
        Address to_addr = Address(rsp, st_off);
        if (from->is_XMMRegister()) {
          if (bt == T_DOUBLE) {
            movdbl(to_addr, from->as_XMMRegister());
          } else {
            assert(bt == T_FLOAT, "must be float");
            movflt(to_addr, from->as_XMMRegister());
          }
        } else {
          movq(to_addr, from->as_Register());
        }
      }
    } else {
      // Stack slot source.
      Address from_addr = Address(rsp, from->reg2stack() * VMRegImpl::stack_slot_size + wordSize);
      if (to->is_reg()) {
        if (to->is_XMMRegister()) {
          if (bt == T_DOUBLE) {
            movdbl(to->as_XMMRegister(), from_addr);
          } else {
            assert(bt == T_FLOAT, "must be float");
            movflt(to->as_XMMRegister(), from_addr);
          }
        } else {
          movq(to->as_Register(), from_addr);
        }
      } else {
        // Stack -> stack: bounce through r13.
        int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
        movq(r13, from_addr);
        movq(Address(rsp, st_off), r13);
      }
    }
  }
  // Update register states: the source becomes writable again, the
  // destination is now written.
  reg_state[from->value()] = reg_writable;
  reg_state[to->value()] = reg_written;
  return true;
}
6311
// Calculate the extra stack space required for packing or unpacking inline
// args and adjust the stack pointer (see MacroAssembler::remove_frame).
// Returns the total stack increment in bytes, including the pushed rbp.
int MacroAssembler::extend_stack_for_inline_args(int args_on_stack) {
  // One stack slot per stack-passed argument, rounded up to the alignment.
  int sp_inc = args_on_stack * VMRegImpl::stack_slot_size;
  sp_inc = align_up(sp_inc, StackAlignmentInBytes);
  assert(sp_inc > 0, "sanity");
  // Two additional slots to account for return address
  sp_inc += 2 * VMRegImpl::stack_slot_size;

  push(rbp);
  subptr(rsp, sp_inc);
#ifdef ASSERT
  // Poison the two extra slots so stale reads are caught in debug builds.
  movl(Address(rsp, 0), badRegWordVal);
  movl(Address(rsp, VMRegImpl::stack_slot_size), badRegWordVal);
#endif
  return sp_inc + wordSize; // account for rbp space
}
6329
6330 // Read all fields from an inline type buffer and store the field values in registers/stack slots.
6331 bool MacroAssembler::unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index,
6332 VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index,
6333 RegState reg_state[]) {
6334 assert(sig->at(sig_index)._bt == T_VOID, "should be at end delimiter");
6335 assert(from->is_valid(), "source must be valid");
6336 bool progress = false;
6337 #ifdef ASSERT
6338 const int start_offset = offset();
6339 #endif
6340
6341 Label L_null, L_notNull;
6342 // Don't use r14 as tmp because it's used for spilling (see MacroAssembler::spill_reg_for)
6343 Register tmp1 = r10;
6344 Register tmp2 = r13;
6345 Register fromReg = noreg;
6346 ScalarizedInlineArgsStream stream(sig, sig_index, to, to_count, to_index, true);
6347 bool done = true;
6348 bool mark_done = true;
6349 VMReg toReg;
6350 BasicType bt;
6351 // Check if argument requires a null check
6352 bool null_check = false;
6353 VMReg nullCheckReg;
6354 while (stream.next(nullCheckReg, bt)) {
6355 if (sig->at(stream.sig_index())._offset == -1) {
6356 null_check = true;
6357 break;
6358 }
6359 }
6360 stream.reset(sig_index, to_index);
6361 while (stream.next(toReg, bt)) {
6362 assert(toReg->is_valid(), "destination must be valid");
6363 int idx = (int)toReg->value();
6364 if (reg_state[idx] == reg_readonly) {
6365 if (idx != from->value()) {
6366 mark_done = false;
6367 }
6368 done = false;
6369 continue;
6370 } else if (reg_state[idx] == reg_written) {
6371 continue;
6372 }
6373 assert(reg_state[idx] == reg_writable, "must be writable");
6374 reg_state[idx] = reg_written;
6375 progress = true;
6376
6377 if (fromReg == noreg) {
6378 if (from->is_reg()) {
6379 fromReg = from->as_Register();
6380 } else {
6381 int st_off = from->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6382 movq(tmp1, Address(rsp, st_off));
6383 fromReg = tmp1;
6384 }
6385 if (null_check) {
6386 // Nullable inline type argument, emit null check
6387 testptr(fromReg, fromReg);
6388 jcc(Assembler::zero, L_null);
6389 }
6390 }
6391 int off = sig->at(stream.sig_index())._offset;
6392 if (off == -1) {
6393 assert(null_check, "Missing null check at");
6394 if (toReg->is_stack()) {
6395 int st_off = toReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6396 movq(Address(rsp, st_off), 1);
6397 } else {
6398 movq(toReg->as_Register(), 1);
6399 }
6400 continue;
6401 }
6402 if (sig->at(stream.sig_index())._vt_oop) {
6403 if (toReg->is_stack()) {
6404 int st_off = toReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6405 movq(Address(rsp, st_off), fromReg);
6406 } else {
6407 movq(toReg->as_Register(), fromReg);
6408 }
6409 continue;
6410 }
6411 assert(off > 0, "offset in object should be positive");
6412 Address fromAddr = Address(fromReg, off);
6413 if (!toReg->is_XMMRegister()) {
6414 Register dst = toReg->is_stack() ? tmp2 : toReg->as_Register();
6415 if (is_reference_type(bt)) {
6416 load_heap_oop(dst, fromAddr);
6417 } else {
6418 bool is_signed = (bt != T_CHAR) && (bt != T_BOOLEAN);
6419 load_sized_value(dst, fromAddr, type2aelembytes(bt), is_signed);
6420 }
6421 if (toReg->is_stack()) {
6422 int st_off = toReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6423 movq(Address(rsp, st_off), dst);
6424 }
6425 } else if (bt == T_DOUBLE) {
6426 movdbl(toReg->as_XMMRegister(), fromAddr);
6427 } else {
6428 assert(bt == T_FLOAT, "must be float");
6429 movflt(toReg->as_XMMRegister(), fromAddr);
6430 }
6431 }
6432 if (progress && null_check) {
6433 if (done) {
6434 jmp(L_notNull);
6435 bind(L_null);
6436 // Set null marker to zero to signal that the argument is null.
6437 // Also set all fields to zero since the runtime requires a canonical
6438 // representation of a flat null.
6439 stream.reset(sig_index, to_index);
6440 while (stream.next(toReg, bt)) {
6441 if (toReg->is_stack()) {
6442 int st_off = toReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6443 movq(Address(rsp, st_off), 0);
6444 } else if (toReg->is_XMMRegister()) {
6445 xorps(toReg->as_XMMRegister(), toReg->as_XMMRegister());
6446 } else {
6447 xorl(toReg->as_Register(), toReg->as_Register());
6448 }
6449 }
6450 bind(L_notNull);
6451 } else {
6452 bind(L_null);
6453 }
6454 }
6455
6456 sig_index = stream.sig_index();
6457 to_index = stream.regs_index();
6458
6459 if (mark_done && reg_state[from->value()] != reg_written) {
6460 // This is okay because no one else will write to that slot
6461 reg_state[from->value()] = reg_writable;
6462 }
6463 from_index--;
6464 assert(progress || (start_offset == offset()), "should not emit code");
6465 return done;
6466 }
6467
// Pack scalarized inline type fields (from registers/stack slots) into a
// buffered value object and place the resulting oop in 'to'. The buffer comes
// from 'val_array' (a pre-allocated pool indexed by vtarg_index) unless the
// argument carries its own non-null buffer oop. Returns true when 'to' has
// been written, false if 'to' is still read-only and the caller must retry.
bool MacroAssembler::pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
                                        VMRegPair* from, int from_count, int& from_index, VMReg to,
                                        RegState reg_state[], Register val_array) {
  assert(sig->at(sig_index)._bt == T_METADATA, "should be at delimiter");
  assert(to->is_valid(), "destination must be valid");

  if (reg_state[to->value()] == reg_written) {
    skip_unpacked_fields(sig, sig_index, from, from_count, from_index);
    return true; // Already written
  }

  // Be careful with r14 because it's used for spilling (see MacroAssembler::spill_reg_for).
  Register val_obj_tmp = r11;
  Register from_reg_tmp = r14;
  Register tmp1 = r10;
  Register tmp2 = r13;
  Register tmp3 = rbx;
  Register val_obj = to->is_stack() ? val_obj_tmp : to->as_Register();

  assert_different_registers(val_obj_tmp, from_reg_tmp, tmp1, tmp2, tmp3, val_array);

  if (reg_state[to->value()] == reg_readonly) {
    // Destination still holds a live source value. Only proceed if that value
    // is one of this argument's own fields; build into a temp in that case.
    if (!is_reg_in_unpacked_fields(sig, sig_index, to, from, from_count, from_index)) {
      skip_unpacked_fields(sig, sig_index, from, from_count, from_index);
      return false; // Not yet writable
    }
    val_obj = val_obj_tmp;
  }

  ScalarizedInlineArgsStream stream(sig, sig_index, from, from_count, from_index);
  VMReg fromReg;
  BasicType bt;
  Label L_null;
  while (stream.next(fromReg, bt)) {
    assert(fromReg->is_valid(), "source must be valid");
    // The source is consumed here; its slot becomes reusable.
    reg_state[fromReg->value()] = reg_writable;

    int off = sig->at(stream.sig_index())._offset;
    if (off == -1) {
      // Nullable inline type argument, emit null check
      // (offset -1 marks the null-marker entry; bit 0 set means non-null).
      Label L_notNull;
      if (fromReg->is_stack()) {
        int ld_off = fromReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
        testb(Address(rsp, ld_off), 1);
      } else {
        testb(fromReg->as_Register(), 1);
      }
      jcc(Assembler::notZero, L_notNull);
      // Null argument: the packed result is the null oop.
      movptr(val_obj, 0);
      jmp(L_null);
      bind(L_notNull);
      continue;
    }
    if (sig->at(stream.sig_index())._vt_oop) {
      // buffer argument: use if non null
      if (fromReg->is_stack()) {
        int ld_off = fromReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
        movptr(val_obj, Address(rsp, ld_off));
      } else {
        movptr(val_obj, fromReg->as_Register());
      }
      testptr(val_obj, val_obj);
      jcc(Assembler::notEqual, L_null);
      // otherwise get the buffer from the just allocated pool of buffers
      int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + vtarg_index * type2aelembytes(T_OBJECT);
      load_heap_oop(val_obj, Address(val_array, index));
      continue;
    }

    assert(off > 0, "offset in object should be positive");
    size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;

    // Pack the scalarized field into the value object.
    Address dst(val_obj, off);
    if (!fromReg->is_XMMRegister()) {
      Register src;
      if (fromReg->is_stack()) {
        src = from_reg_tmp;
        int ld_off = fromReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
        load_sized_value(src, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
      } else {
        src = fromReg->as_Register();
      }
      assert_different_registers(dst.base(), src, tmp1, tmp2, tmp3, val_array);
      if (is_reference_type(bt)) {
        // store_heap_oop transitively calls oop_store_at which corrupts to.base(). We need to keep val_obj valid.
        mov(tmp3, val_obj);
        Address dst_with_tmp3(tmp3, off);
        store_heap_oop(dst_with_tmp3, src, tmp1, tmp2, tmp3, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
      } else {
        store_sized_value(dst, src, size_in_bytes);
      }
    } else if (bt == T_DOUBLE) {
      movdbl(dst, fromReg->as_XMMRegister());
    } else {
      assert(bt == T_FLOAT, "must be float");
      movflt(dst, fromReg->as_XMMRegister());
    }
  }
  // L_null: reached with val_obj already holding the final oop (null, or an
  // existing buffer) so field packing is skipped.
  bind(L_null);
  sig_index = stream.sig_index();
  from_index = stream.regs_index();

  assert(reg_state[to->value()] == reg_writable, "must have already been read");
  // Finally move the packed oop into the real destination.
  bool success = move_helper(val_obj->as_VMReg(), to, T_OBJECT, reg_state);
  assert(success, "to register must be writeable");
  return true;
}
6576
6577 VMReg MacroAssembler::spill_reg_for(VMReg reg) {
6578 return reg->is_XMMRegister() ? xmm8->as_VMReg() : r14->as_VMReg();
6579 }
6580
// Tear down the current frame on method exit. When needs_stack_repair is set,
// the frame size is read from the sp_inc stack slot (see the diagrams below)
// instead of being the compile-time constant initial_framesize.
void MacroAssembler::remove_frame(int initial_framesize, bool needs_stack_repair) {
  assert((initial_framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  if (needs_stack_repair) {
    // The method has a scalarized entry point (where fields of value object arguments
    // are passed through registers and stack), and a non-scalarized entry point (where
    // value object arguments are given as oops). The non-scalarized entry point will
    // first load each field of value object arguments and store them in registers and on
    // the stack in a way compatible with the scalarized entry point. To do so, some extra
    // stack space might be reserved (if argument registers are not enough). On leaving the
    // method, this space must be freed.
    //
    // In case we used the non-scalarized entry point the stack looks like this:
    //
    // | Arguments from caller     |
    // |---------------------------|  <-- caller's SP
    // | Return address #1         |
    // | Saved RBP #1              |
    // |---------------------------|
    // | Extension space for       |
    // | inline arg (un)packing    |
    // |---------------------------|  <-- start of this method's frame
    // | Return address #2         |
    // | Saved RBP #2              |
    // |---------------------------|  <-- RBP (with -XX:+PreserveFramePointer)
    // | sp_inc                    |
    // | method locals             |
    // |---------------------------|  <-- SP
    //
    // Space for the return pc and saved rbp is reserved twice. But only the #1 copies
    // contain the real values of return pc and saved rbp. The #2 copies are not reliable
    // and should not be used. They are mostly needed to add space between the extension
    // space and the locals, as there would be between the real arguments and the locals
    // if we don't need to do unpacking (from the scalarized entry point).
    //
    // When leaving, one must load RBP #1 into RBP, and use the copy #1 of the return address,
    // while keeping in mind that from the scalarized entry point, there will be only one
    // copy. Indeed, in the case we used the scalarized calling convention, the stack looks like this:
    //
    // | Arguments from caller     |
    // |---------------------------|  <-- caller's SP
    // | Return address            |
    // | Saved RBP                 |
    // |---------------------------|  <-- FP (with -XX:+PreserveFramePointer)
    // | sp_inc                    |
    // | method locals             |
    // |---------------------------|  <-- SP
    //
    // The sp_inc stack slot holds the total size of the frame, including the extension
    // space and copies #2 of the return address and the saved RBP (but never the copies
    // #1 of the return address and saved RBP). That is how to find the copies #1 of the
    // return address and saved rbp. This size is expressed in bytes. Be careful when using
    // it from C++ in pointer arithmetic you might need to divide it by wordSize.

    // The stack increment resides just below the saved rbp
    addq(rsp, Address(rsp, initial_framesize - wordSize));
    pop(rbp);
  } else {
    // Fixed-size frame: drop the locals and restore the caller's rbp.
    if (initial_framesize > 0) {
      addq(rsp, initial_framesize);
    }
    pop(rbp);
  }
}
6644
6645 #if COMPILER2_OR_JVMCI
6646
// clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp, KRegister mask) {
  // cnt - number of qwords (8-byte words).
  // base - start address, qword aligned.
  // val  - the qword value to fill with (broadcast into xtmp below).
  Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
  bool use64byteVector = (MaxVectorSize == 64) && (CopyAVX3Threshold == 0);
  // Broadcast val across the widest available vector register.
  if (use64byteVector) {
    evpbroadcastq(xtmp, val, AVX_512bit);
  } else if (MaxVectorSize >= 32) {
    movdq(xtmp, val);
    punpcklqdq(xtmp, xtmp);
    vinserti128_high(xtmp, xtmp);
  } else {
    movdq(xtmp, val);
    punpcklqdq(xtmp, xtmp);
  }
  jmp(L_zero_64_bytes);

  // Main loop: store 64 bytes per iteration.
  BIND(L_loop);
  if (MaxVectorSize >= 32) {
    fill64(base, 0, xtmp, use64byteVector);
  } else {
    movdqu(Address(base, 0), xtmp);
    movdqu(Address(base, 16), xtmp);
    movdqu(Address(base, 32), xtmp);
    movdqu(Address(base, 48), xtmp);
  }
  addptr(base, 64);

  BIND(L_zero_64_bytes);
  subptr(cnt, 8);
  jccb(Assembler::greaterEqual, L_loop);

  // Copy trailing 64 bytes
  if (use64byteVector) {
    // AVX-512: handle the remaining (< 8) qwords with one masked 64-byte store.
    addptr(cnt, 8);
    jccb(Assembler::equal, L_end);
    fill64_masked(3, base, 0, xtmp, mask, cnt, val, true);
    jmp(L_end);
    // NOTE(review): the code emitted below (32-byte step and the qword loop)
    // appears unreachable in this configuration since both paths above jump
    // to L_end — harmless, but worth confirming.
  } else {
    // Handle a remaining chunk of 4..7 qwords with one 32-byte store,
    // then fall through to the per-qword tail.
    addptr(cnt, 4);
    jccb(Assembler::less, L_tail);
    if (MaxVectorSize >= 32) {
      vmovdqu(Address(base, 0), xtmp);
    } else {
      movdqu(Address(base, 0), xtmp);
      movdqu(Address(base, 16), xtmp);
    }
  }
  addptr(base, 32);
  subptr(cnt, 4);

  BIND(L_tail);
  addptr(cnt, 4);
  jccb(Assembler::lessEqual, L_end);
  if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
    // Masked 32-byte store covers the remaining 1..3 qwords in one shot.
    fill32_masked(3, base, 0, xtmp, mask, cnt, val);
  } else {
    // Scalar tail: one qword store per iteration.
    decrement(cnt);

    BIND(L_sloop);
    movq(Address(base, 0), xtmp);
    addptr(base, 8);
    decrement(cnt);
    jccb(Assembler::greaterEqual, L_sloop);
  }
  BIND(L_end);
}
6715
// Clearing constant sized memory using YMM/ZMM registers.
// 'cnt' is a compile-time constant number of qwords, so the fill is fully
// unrolled (with a small loop for very large counts) and the tail is handled
// with an exact-width (possibly masked) store sequence.
void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
  assert(UseAVX > 2 && VM_Version::supports_avx512vl(), "");
  bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);

  // Split cnt into full 64-byte (8-qword) chunks and a 0..7 qword tail.
  int vector64_count = (cnt & (~0x7)) >> 3;
  cnt = cnt & 0x7;
  const int fill64_per_loop = 4;
  const int max_unrolled_fill64 = 8;

  // 64 byte initialization loop.
  vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
  int start64 = 0;
  if (vector64_count > max_unrolled_fill64) {
    // Too many chunks to fully unroll: emit a loop doing fill64_per_loop
    // chunks per iteration, leaving the remainder for the unrolled code below.
    Label LOOP;
    Register index = rtmp;

    start64 = vector64_count - (vector64_count % fill64_per_loop);

    movl(index, 0);
    BIND(LOOP);
    for (int i = 0; i < fill64_per_loop; i++) {
      fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
    }
    addl(index, fill64_per_loop * 64);
    cmpl(index, start64 * 64);
    jccb(Assembler::less, LOOP);
  }
  // Fully unrolled remainder of 64-byte chunks.
  for (int i = start64; i < vector64_count; i++) {
    fill64(base, i * 64, xtmp, use64byteVector);
  }

  // Clear remaining 64 byte tail.
  int disp = vector64_count * 64;
  if (cnt) {
    // One case per residual qword count; masked stores write exactly the
    // needed bytes without over-stepping the buffer.
    switch (cnt) {
      case 1:
        movq(Address(base, disp), xtmp);
        break;
      case 2:
        evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit);
        break;
      case 3:
        movl(rtmp, 0x7);
        kmovwl(mask, rtmp);
        evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit);
        break;
      case 4:
        evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
        break;
      case 5:
        if (use64byteVector) {
          movl(rtmp, 0x1F);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
        } else {
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
          movq(Address(base, disp + 32), xtmp);
        }
        break;
      case 6:
        if (use64byteVector) {
          movl(rtmp, 0x3F);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
        } else {
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
          evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit);
        }
        break;
      case 7:
        if (use64byteVector) {
          movl(rtmp, 0x7F);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
        } else {
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
          movl(rtmp, 0x7);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit);
        }
        break;
      default:
        fatal("Unexpected length : %d\n",cnt);
        break;
    }
  }
}
6804
// Clear (fill with 'val') a runtime-sized region of 'cnt' qwords at 'base'.
// Short regions use a small store loop; longer ones use rep-stos or the
// XMM-based path, depending on the VM flags.
void MacroAssembler::clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp,
                               bool is_large, bool word_copy_only, KRegister mask) {
  // cnt - number of qwords (8-byte words).
  // base - start address, qword aligned.
  // is_large - if optimizers know cnt is larger than InitArrayShortSize
  assert(base==rdi, "base register must be edi for rep stos");
  assert(val==rax, "val register must be eax for rep stos");
  assert(cnt==rcx, "cnt register must be ecx for rep stos");
  assert(InitArrayShortSize % BytesPerLong == 0,
    "InitArrayShortSize should be the multiple of BytesPerLong");

  Label DONE;

  if (!is_large) {
    Label LOOP, LONG;
    cmpptr(cnt, InitArrayShortSize/BytesPerLong);
    jccb(Assembler::greater, LONG);

    decrement(cnt);
    jccb(Assembler::negative, DONE); // Zero length

    // Use individual pointer-sized stores for small counts:
    // (stores backwards, from the last qword down to the first)
    BIND(LOOP);
    movptr(Address(base, cnt, Address::times_ptr), val);
    decrement(cnt);
    jccb(Assembler::greaterEqual, LOOP);
    jmpb(DONE);

    BIND(LONG);
  }

  // Use longer rep-prefixed ops for non-small counts:
  if (UseFastStosb && !word_copy_only) {
    shlptr(cnt, 3); // convert to number of bytes
    rep_stosb();
  } else if (UseXMMForObjInit) {
    xmm_clear_mem(base, cnt, val, xtmp, mask);
  } else {
    // rep stosq: fills cnt qwords with rax.
    rep_stos();
  }

  BIND(DONE);
}
6848
6849 #endif //COMPILER2_OR_JVMCI
6850
6851
// Fill 'count' elements of type t at 'to' with 'value'. 'count' is in
// elements; 'value' is replicated to a full 32-bit pattern for sub-int types.
// 'aligned' indicates the destination is known suitably aligned.
void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                   Register to, Register value, Register count,
                                   Register rtmp, XMMRegister xtmp) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(to, value, count, rtmp);
  Label L_exit;
  Label L_fill_2_bytes, L_fill_4_bytes;

#if defined(COMPILER2)
  // Prefer the AVX-512 fill stub when the hardware supports it.
  if(MaxVectorSize >=32 &&
     VM_Version::supports_avx512vlbw() &&
     VM_Version::supports_bmi2()) {
    generate_fill_avx3(t, to, value, count, rtmp, xtmp);
    return;
  }
#endif

  // 'shift' converts between element counts and byte-scaled thresholds:
  // (n << shift) elements occupy 4*n bytes (shift = log2(4 / element size)).
  int shift = -1;
  switch (t) {
    case T_BYTE:
      shift = 2;
      break;
    case T_SHORT:
      shift = 1;
      break;
    case T_INT:
      shift = 0;
      break;
    default: ShouldNotReachHere();
  }

  // Replicate the value to fill a full 32-bit pattern.
  if (t == T_BYTE) {
    andl(value, 0xff);
    movl(rtmp, value);
    shll(rtmp, 8);
    orl(value, rtmp);
  }
  if (t == T_SHORT) {
    andl(value, 0xffff);
  }
  if (t == T_BYTE || t == T_SHORT) {
    movl(rtmp, value);
    shll(rtmp, 16);
    orl(value, rtmp);
  }

  cmpptr(count, 8 << shift); // Short arrays (< 32 bytes) fill by element
  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
    Label L_skip_align2;
    // align source address at 4 bytes address boundary
    if (t == T_BYTE) {
      Label L_skip_align1;
      // One byte misalignment happens only for byte arrays
      testptr(to, 1);
      jccb(Assembler::zero, L_skip_align1);
      movb(Address(to, 0), value);
      increment(to);
      decrement(count);
      BIND(L_skip_align1);
    }
    // Two bytes misalignment happens only for byte and short (char) arrays
    testptr(to, 2);
    jccb(Assembler::zero, L_skip_align2);
    movw(Address(to, 0), value);
    addptr(to, 2);
    subptr(count, 1<<(shift-1));
    BIND(L_skip_align2);
  }
  {
    Label L_fill_32_bytes;
    if (!UseUnalignedLoadStores) {
      // align to 8 bytes, we know we are 4 byte aligned to start
      testptr(to, 4);
      jccb(Assembler::zero, L_fill_32_bytes);
      movl(Address(to, 0), value);
      addptr(to, 4);
      subptr(count, 1<<shift);
    }
    BIND(L_fill_32_bytes);
    {
      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
      movdl(xtmp, value);
      if (UseAVX >= 2 && UseUnalignedLoadStores) {
        Label L_check_fill_32_bytes;
        if (UseAVX > 2) {
          // Fill 64-byte chunks
          Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;

          // If number of bytes to fill < CopyAVX3Threshold, perform fill using AVX2
          cmpptr(count, CopyAVX3Threshold);
          jccb(Assembler::below, L_check_fill_64_bytes_avx2);

          vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);

          subptr(count, 16 << shift);
          jcc(Assembler::less, L_check_fill_32_bytes);
          align(16);

          // AVX-512 main loop: one 64-byte store per iteration.
          BIND(L_fill_64_bytes_loop_avx3);
          evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
          addptr(to, 64);
          subptr(count, 16 << shift);
          jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
          jmpb(L_check_fill_32_bytes);

          BIND(L_check_fill_64_bytes_avx2);
        }
        // Fill 64-byte chunks
        vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);

        subptr(count, 16 << shift);
        jcc(Assembler::less, L_check_fill_32_bytes);

        // align data for 64-byte chunks
        Label L_fill_64_bytes_loop, L_align_64_bytes_loop;
        if (EnableX86ECoreOpts) {
          // align 'big' arrays to cache lines to minimize split_stores
          cmpptr(count, 96 << shift);
          jcc(Assembler::below, L_fill_64_bytes_loop);

          // Find the bytes needed for alignment
          movptr(rtmp, to);
          andptr(rtmp, 0x1c);
          jcc(Assembler::zero, L_fill_64_bytes_loop);
          negptr(rtmp); // number of bytes to fill 32-rtmp. it filled by 2 mov by 32
          addptr(rtmp, 32);
          shrptr(rtmp, 2 - shift);// get number of elements from bytes
          subptr(count, rtmp); // adjust count by number of elements

          // Fill one int at a time until 'to' is 32-byte aligned.
          align(16);
          BIND(L_align_64_bytes_loop);
          movdl(Address(to, 0), xtmp);
          addptr(to, 4);
          subptr(rtmp, 1 << shift);
          jcc(Assembler::greater, L_align_64_bytes_loop);
        }

        // AVX2 main loop: two 32-byte stores (64 bytes) per iteration.
        align(16);
        BIND(L_fill_64_bytes_loop);
        vmovdqu(Address(to, 0), xtmp);
        vmovdqu(Address(to, 32), xtmp);
        addptr(to, 64);
        subptr(count, 16 << shift);
        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);

        // Handle a remaining 32-byte chunk, if any.
        align(16);
        BIND(L_check_fill_32_bytes);
        addptr(count, 8 << shift);
        jccb(Assembler::less, L_check_fill_8_bytes);
        vmovdqu(Address(to, 0), xtmp);
        addptr(to, 32);
        subptr(count, 8 << shift);

        BIND(L_check_fill_8_bytes);
        // clean upper bits of YMM registers
        movdl(xtmp, value);
        pshufd(xtmp, xtmp, 0);
      } else {
        // Fill 32-byte chunks
        pshufd(xtmp, xtmp, 0);

        subptr(count, 8 << shift);
        jcc(Assembler::less, L_check_fill_8_bytes);
        align(16);

        BIND(L_fill_32_bytes_loop);

        if (UseUnalignedLoadStores) {
          movdqu(Address(to, 0), xtmp);
          movdqu(Address(to, 16), xtmp);
        } else {
          movq(Address(to, 0), xtmp);
          movq(Address(to, 8), xtmp);
          movq(Address(to, 16), xtmp);
          movq(Address(to, 24), xtmp);
        }

        addptr(to, 32);
        subptr(count, 8 << shift);
        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);

        BIND(L_check_fill_8_bytes);
      }
      addptr(count, 8 << shift);
      jccb(Assembler::zero, L_exit);
      jmpb(L_fill_8_bytes);

      //
      // length is too short, just fill qwords
      //
      align(16);
      BIND(L_fill_8_bytes_loop);
      movq(Address(to, 0), xtmp);
      addptr(to, 8);
      BIND(L_fill_8_bytes);
      subptr(count, 1 << (shift + 1));
      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    }
  }

  // Fill trailing 4 bytes, if the remaining count includes one int's worth.
  Label L_fill_4_bytes_loop;
  testl(count, 1 << shift);
  jccb(Assembler::zero, L_fill_2_bytes);

  align(16);
  BIND(L_fill_4_bytes_loop);
  movl(Address(to, 0), value);
  addptr(to, 4);

  BIND(L_fill_4_bytes);
  subptr(count, 1 << shift);
  jccb(Assembler::greaterEqual, L_fill_4_bytes_loop);

  if (t == T_BYTE || t == T_SHORT) {
    Label L_fill_byte;
    BIND(L_fill_2_bytes);
    // fill trailing 2 bytes
    testl(count, 1<<(shift-1));
    jccb(Assembler::zero, L_fill_byte);
    movw(Address(to, 0), value);
    if (t == T_BYTE) {
      addptr(to, 2);
      BIND(L_fill_byte);
      // fill trailing byte
      testl(count, 1);
      jccb(Assembler::zero, L_exit);
      movb(Address(to, 0), value);
    } else {
      BIND(L_fill_byte);
    }
  } else {
    BIND(L_fill_2_bytes);
  }
  BIND(L_exit);
}
7088
7089 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
7090 switch(type) {
7091 case T_BYTE:
7092 case T_BOOLEAN:
7093 evpbroadcastb(dst, src, vector_len);
7094 break;
7095 case T_SHORT:
7096 case T_CHAR:
7097 evpbroadcastw(dst, src, vector_len);
7098 break;
7099 case T_INT:
7100 case T_FLOAT:
7101 evpbroadcastd(dst, src, vector_len);
7102 break;
7103 case T_LONG:
7104 case T_DOUBLE:
7105 evpbroadcastq(dst, src, vector_len);
7106 break;
7107 default:
7108 fatal("Unhandled type : %s", type2name(type));
7109 break;
7110 }
7111 }
7112
7113 // Encode given char[]/byte[] to byte[] in ISO_8859_1 or ASCII
7114 //
7115 // @IntrinsicCandidate
7116 // int sun.nio.cs.ISO_8859_1.Encoder#encodeISOArray0(
7117 // char[] sa, int sp, byte[] da, int dp, int len) {
7118 // int i = 0;
7119 // for (; i < len; i++) {
7120 // char c = sa[sp++];
7121 // if (c > '\u00FF')
7122 // break;
7123 // da[dp++] = (byte) c;
7124 // }
7125 // return i;
7126 // }
7127 //
7128 // @IntrinsicCandidate
7129 // int java.lang.StringCoding.encodeISOArray0(
7130 // byte[] sa, int sp, byte[] da, int dp, int len) {
7131 // int i = 0;
7132 // for (; i < len; i++) {
7133 // char c = StringUTF16.getChar(sa, sp++);
7134 // if (c > '\u00FF')
7135 // break;
7136 // da[dp++] = (byte) c;
7137 // }
7138 // return i;
7139 // }
7140 //
7141 // @IntrinsicCandidate
7142 // int java.lang.StringCoding.encodeAsciiArray0(
7143 // char[] sa, int sp, byte[] da, int dp, int len) {
7144 // int i = 0;
7145 // for (; i < len; i++) {
7146 // char c = sa[sp++];
7147 // if (c >= '\u0080')
7148 // break;
7149 // da[dp++] = (byte) c;
7150 // }
7151 // return i;
7152 // }
void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
                                      XMMRegister tmp1Reg, XMMRegister tmp2Reg,
                                      XMMRegister tmp3Reg, XMMRegister tmp4Reg,
                                      Register tmp5, Register result, bool ascii) {

  // Register usage at the intrinsic call sites:
  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(src, dst, len, tmp5, result);
  Label L_done, L_copy_1_char, L_copy_1_char_exit;

  // An ISO-8859-1 encodable char has none of the 0xff00 bits set; an ASCII
  // encodable char has none of the 0xff80 bits set. The vector mask repeats
  // the per-char mask in both 16-bit halves of a 32-bit lane.
  int mask = ascii ? 0xff80ff80 : 0xff00ff00;
  int short_mask = ascii ? 0xff80 : 0xff00;

  // set result (number of chars encoded) to 0
  xorl(result, result);
  // check for zero length
  testl(len, len);
  jcc(Assembler::zero, L_done);

  movl(result, len);

  // Setup pointers to the ends of the arrays; 'len' is then used as a
  // negative index that counts up toward zero.
  lea(src, Address(src, len, Address::times_2)); // char[]
  lea(dst, Address(dst, len, Address::times_1)); // byte[]
  negptr(len);

  if (UseSSE42Intrinsics || UseAVX >= 2) {
    Label L_copy_8_chars, L_copy_8_chars_exit;
    Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;

    if (UseAVX >= 2) {
      // 32 chars (64 bytes) per iteration using two 256-bit loads.
      Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
      movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
      movdl(tmp1Reg, tmp5);
      vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
      jmp(L_chars_32_check);

      bind(L_copy_32_chars);
      vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
      vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
      vptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
      jccb(Assembler::notZero, L_copy_32_chars_exit);
      // Narrow chars to bytes; vpermq fixes the 128-bit-lane interleave
      // that vpackuswb produces on 256-bit vectors.
      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
      vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);

      bind(L_chars_32_check);
      addptr(len, 32);
      jcc(Assembler::lessEqual, L_copy_32_chars);

      bind(L_copy_32_chars_exit);
      subptr(len, 16);
      jccb(Assembler::greater, L_copy_16_chars_exit);

    } else if (UseSSE42Intrinsics) {
      movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
      jmpb(L_chars_16_check);
    }

    // 16 chars (32 bytes) per iteration.
    bind(L_copy_16_chars);
    if (UseAVX >= 2) {
      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
      vptest(tmp2Reg, tmp1Reg);
      jcc(Assembler::notZero, L_copy_16_chars_exit);
      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
    } else {
      if (UseAVX > 0) {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
      } else {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        por(tmp2Reg, tmp3Reg);
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        por(tmp2Reg, tmp4Reg);
      }
      ptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
      jccb(Assembler::notZero, L_copy_16_chars_exit);
      packuswb(tmp3Reg, tmp4Reg);
    }
    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);

    bind(L_chars_16_check);
    addptr(len, 16);
    jcc(Assembler::lessEqual, L_copy_16_chars);

    bind(L_copy_16_chars_exit);
    if (UseAVX >= 2) {
      // clean upper bits of YMM registers and rebuild the 128-bit mask
      // used by the SSE tail below
      vpxor(tmp2Reg, tmp2Reg);
      vpxor(tmp3Reg, tmp3Reg);
      vpxor(tmp4Reg, tmp4Reg);
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
    }
    // 8 chars (16 bytes) per iteration.
    subptr(len, 8);
    jccb(Assembler::greater, L_copy_8_chars_exit);

    bind(L_copy_8_chars);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
    ptest(tmp3Reg, tmp1Reg);
    jccb(Assembler::notZero, L_copy_8_chars_exit);
    packuswb(tmp3Reg, tmp1Reg);
    movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
    addptr(len, 8);
    jccb(Assembler::lessEqual, L_copy_8_chars);

    bind(L_copy_8_chars_exit);
    subptr(len, 8);
    jccb(Assembler::zero, L_done);
  }

  // Scalar tail: copy one char at a time until a non-encodable char is
  // found or 'len' reaches 0.
  bind(L_copy_1_char);
  load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
  testl(tmp5, short_mask);      // check if Unicode or non-ASCII char
  jccb(Assembler::notZero, L_copy_1_char_exit);
  movb(Address(dst, len, Address::times_1, 0), tmp5);
  addptr(len, 1);
  jccb(Assembler::less, L_copy_1_char);

  bind(L_copy_1_char_exit);
  addptr(result, len); // len is negative count of not processed elements

  bind(L_done);
}
7286
7287 /**
7288 * Helper for multiply_to_len().
7289 */
7290 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
7291 addq(dest_lo, src1);
7292 adcq(dest_hi, 0);
7293 addq(dest_lo, src2);
7294 adcq(dest_hi, 0);
7295 }
7296
7297 /**
7298 * Multiply 64 bit by 64 bit first loop.
7299 */
7300 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
7301 Register y, Register y_idx, Register z,
7302 Register carry, Register product,
7303 Register idx, Register kdx) {
7304 //
7305 // jlong carry, x[], y[], z[];
7306 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
7307 // huge_128 product = y[idx] * x[xstart] + carry;
7308 // z[kdx] = (jlong)product;
7309 // carry = (jlong)(product >>> 64);
7310 // }
7311 // z[xstart] = carry;
7312 //
7313
7314 Label L_first_loop, L_first_loop_exit;
7315 Label L_one_x, L_one_y, L_multiply;
7316
7317 decrementl(xstart);
7318 jcc(Assembler::negative, L_one_x);
7319
7320 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
7321 rorq(x_xstart, 32); // convert big-endian to little-endian
7322
7323 bind(L_first_loop);
7324 decrementl(idx);
7325 jcc(Assembler::negative, L_first_loop_exit);
7326 decrementl(idx);
7327 jcc(Assembler::negative, L_one_y);
7328 movq(y_idx, Address(y, idx, Address::times_4, 0));
7329 rorq(y_idx, 32); // convert big-endian to little-endian
7330 bind(L_multiply);
7331 movq(product, x_xstart);
7332 mulq(y_idx); // product(rax) * y_idx -> rdx:rax
7333 addq(product, carry);
7334 adcq(rdx, 0);
7335 subl(kdx, 2);
7336 movl(Address(z, kdx, Address::times_4, 4), product);
7337 shrq(product, 32);
7338 movl(Address(z, kdx, Address::times_4, 0), product);
7339 movq(carry, rdx);
7340 jmp(L_first_loop);
7341
7342 bind(L_one_y);
7343 movl(y_idx, Address(y, 0));
7344 jmp(L_multiply);
7345
7346 bind(L_one_x);
7347 movl(x_xstart, Address(x, 0));
7348 jmp(L_first_loop);
7349
7350 bind(L_first_loop_exit);
7351 }
7352
7353 /**
7354 * Multiply 64 bit by 64 bit and add 128 bit.
7355 */
7356 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
7357 Register yz_idx, Register idx,
7358 Register carry, Register product, int offset) {
7359 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
7360 // z[kdx] = (jlong)product;
7361
7362 movq(yz_idx, Address(y, idx, Address::times_4, offset));
7363 rorq(yz_idx, 32); // convert big-endian to little-endian
7364 movq(product, x_xstart);
7365 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7366 movq(yz_idx, Address(z, idx, Address::times_4, offset));
7367 rorq(yz_idx, 32); // convert big-endian to little-endian
7368
7369 add2_with_carry(rdx, product, carry, yz_idx);
7370
7371 movl(Address(z, idx, Address::times_4, offset+4), product);
7372 shrq(product, 32);
7373 movl(Address(z, idx, Address::times_4, offset), product);
7374
7375 }
7376
7377 /**
7378 * Multiply 128 bit by 128 bit. Unrolled inner loop.
7379 */
7380 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
7381 Register yz_idx, Register idx, Register jdx,
7382 Register carry, Register product,
7383 Register carry2) {
7384 // jlong carry, x[], y[], z[];
7385 // int kdx = ystart+1;
7386 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7387 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
7388 // z[kdx+idx+1] = (jlong)product;
7389 // jlong carry2 = (jlong)(product >>> 64);
7390 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
7391 // z[kdx+idx] = (jlong)product;
7392 // carry = (jlong)(product >>> 64);
7393 // }
7394 // idx += 2;
7395 // if (idx > 0) {
7396 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
7397 // z[kdx+idx] = (jlong)product;
7398 // carry = (jlong)(product >>> 64);
7399 // }
7400 //
7401
7402 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7403
7404 movl(jdx, idx);
7405 andl(jdx, 0xFFFFFFFC);
7406 shrl(jdx, 2);
7407
7408 bind(L_third_loop);
7409 subl(jdx, 1);
7410 jcc(Assembler::negative, L_third_loop_exit);
7411 subl(idx, 4);
7412
7413 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
7414 movq(carry2, rdx);
7415
7416 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
7417 movq(carry, rdx);
7418 jmp(L_third_loop);
7419
7420 bind (L_third_loop_exit);
7421
7422 andl (idx, 0x3);
7423 jcc(Assembler::zero, L_post_third_loop_done);
7424
7425 Label L_check_1;
7426 subl(idx, 2);
7427 jcc(Assembler::negative, L_check_1);
7428
7429 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
7430 movq(carry, rdx);
7431
7432 bind (L_check_1);
7433 addl (idx, 0x2);
7434 andl (idx, 0x1);
7435 subl(idx, 1);
7436 jcc(Assembler::negative, L_post_third_loop_done);
7437
7438 movl(yz_idx, Address(y, idx, Address::times_4, 0));
7439 movq(product, x_xstart);
7440 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7441 movl(yz_idx, Address(z, idx, Address::times_4, 0));
7442
7443 add2_with_carry(rdx, product, yz_idx, carry);
7444
7445 movl(Address(z, idx, Address::times_4, 0), product);
7446 shrq(product, 32);
7447
7448 shlq(rdx, 32);
7449 orq(product, rdx);
7450 movq(carry, product);
7451
7452 bind(L_post_third_loop_done);
7453 }
7454
7455 /**
7456 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
7457 *
7458 */
7459 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
7460 Register carry, Register carry2,
7461 Register idx, Register jdx,
7462 Register yz_idx1, Register yz_idx2,
7463 Register tmp, Register tmp3, Register tmp4) {
7464 assert(UseBMI2Instructions, "should be used only when BMI2 is available");
7465
7466 // jlong carry, x[], y[], z[];
7467 // int kdx = ystart+1;
7468 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7469 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
7470 // jlong carry2 = (jlong)(tmp3 >>> 64);
7471 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
7472 // carry = (jlong)(tmp4 >>> 64);
7473 // z[kdx+idx+1] = (jlong)tmp3;
7474 // z[kdx+idx] = (jlong)tmp4;
7475 // }
7476 // idx += 2;
7477 // if (idx > 0) {
7478 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
7479 // z[kdx+idx] = (jlong)yz_idx1;
7480 // carry = (jlong)(yz_idx1 >>> 64);
7481 // }
7482 //
7483
7484 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7485
7486 movl(jdx, idx);
7487 andl(jdx, 0xFFFFFFFC);
7488 shrl(jdx, 2);
7489
7490 bind(L_third_loop);
7491 subl(jdx, 1);
7492 jcc(Assembler::negative, L_third_loop_exit);
7493 subl(idx, 4);
7494
7495 movq(yz_idx1, Address(y, idx, Address::times_4, 8));
7496 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
7497 movq(yz_idx2, Address(y, idx, Address::times_4, 0));
7498 rorxq(yz_idx2, yz_idx2, 32);
7499
7500 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
7501 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp
7502
7503 movq(yz_idx1, Address(z, idx, Address::times_4, 8));
7504 rorxq(yz_idx1, yz_idx1, 32);
7505 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
7506 rorxq(yz_idx2, yz_idx2, 32);
7507
7508 if (VM_Version::supports_adx()) {
7509 adcxq(tmp3, carry);
7510 adoxq(tmp3, yz_idx1);
7511
7512 adcxq(tmp4, tmp);
7513 adoxq(tmp4, yz_idx2);
7514
7515 movl(carry, 0); // does not affect flags
7516 adcxq(carry2, carry);
7517 adoxq(carry2, carry);
7518 } else {
7519 add2_with_carry(tmp4, tmp3, carry, yz_idx1);
7520 add2_with_carry(carry2, tmp4, tmp, yz_idx2);
7521 }
7522 movq(carry, carry2);
7523
7524 movl(Address(z, idx, Address::times_4, 12), tmp3);
7525 shrq(tmp3, 32);
7526 movl(Address(z, idx, Address::times_4, 8), tmp3);
7527
7528 movl(Address(z, idx, Address::times_4, 4), tmp4);
7529 shrq(tmp4, 32);
7530 movl(Address(z, idx, Address::times_4, 0), tmp4);
7531
7532 jmp(L_third_loop);
7533
7534 bind (L_third_loop_exit);
7535
7536 andl (idx, 0x3);
7537 jcc(Assembler::zero, L_post_third_loop_done);
7538
7539 Label L_check_1;
7540 subl(idx, 2);
7541 jcc(Assembler::negative, L_check_1);
7542
7543 movq(yz_idx1, Address(y, idx, Address::times_4, 0));
7544 rorxq(yz_idx1, yz_idx1, 32);
7545 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
7546 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
7547 rorxq(yz_idx2, yz_idx2, 32);
7548
7549 add2_with_carry(tmp4, tmp3, carry, yz_idx2);
7550
7551 movl(Address(z, idx, Address::times_4, 4), tmp3);
7552 shrq(tmp3, 32);
7553 movl(Address(z, idx, Address::times_4, 0), tmp3);
7554 movq(carry, tmp4);
7555
7556 bind (L_check_1);
7557 addl (idx, 0x2);
7558 andl (idx, 0x1);
7559 subl(idx, 1);
7560 jcc(Assembler::negative, L_post_third_loop_done);
7561 movl(tmp4, Address(y, idx, Address::times_4, 0));
7562 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
7563 movl(tmp4, Address(z, idx, Address::times_4, 0));
7564
7565 add2_with_carry(carry2, tmp3, tmp4, carry);
7566
7567 movl(Address(z, idx, Address::times_4, 0), tmp3);
7568 shrq(tmp3, 32);
7569
7570 shlq(carry2, 32);
7571 orq(tmp3, carry2);
7572 movq(carry, tmp3);
7573
7574 bind(L_post_third_loop_done);
7575 }
7576
7577 /**
7578 * Code for BigInteger::multiplyToLen() intrinsic.
7579 *
7580 * rdi: x
7581 * rax: xlen
7582 * rsi: y
7583 * rcx: ylen
7584 * r8: z
7585 * r11: tmp0
7586 * r12: tmp1
7587 * r13: tmp2
7588 * r14: tmp3
7589 * r15: tmp4
7590 * rbx: tmp5
7591 *
7592 */
7593 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register tmp0,
7594 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
7595 ShortBranchVerifier sbv(this);
7596 assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
7597
7598 push(tmp0);
7599 push(tmp1);
7600 push(tmp2);
7601 push(tmp3);
7602 push(tmp4);
7603 push(tmp5);
7604
7605 push(xlen);
7606
7607 const Register idx = tmp1;
7608 const Register kdx = tmp2;
7609 const Register xstart = tmp3;
7610
7611 const Register y_idx = tmp4;
7612 const Register carry = tmp5;
7613 const Register product = xlen;
7614 const Register x_xstart = tmp0;
7615
7616 // First Loop.
7617 //
7618 // final static long LONG_MASK = 0xffffffffL;
7619 // int xstart = xlen - 1;
7620 // int ystart = ylen - 1;
7621 // long carry = 0;
7622 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
7623 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
7624 // z[kdx] = (int)product;
7625 // carry = product >>> 32;
7626 // }
7627 // z[xstart] = (int)carry;
7628 //
7629
7630 movl(idx, ylen); // idx = ylen;
7631 lea(kdx, Address(xlen, ylen)); // kdx = xlen+ylen;
7632 xorq(carry, carry); // carry = 0;
7633
7634 Label L_done;
7635
7636 movl(xstart, xlen);
7637 decrementl(xstart);
7638 jcc(Assembler::negative, L_done);
7639
7640 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
7641
7642 Label L_second_loop;
7643 testl(kdx, kdx);
7644 jcc(Assembler::zero, L_second_loop);
7645
7646 Label L_carry;
7647 subl(kdx, 1);
7648 jcc(Assembler::zero, L_carry);
7649
7650 movl(Address(z, kdx, Address::times_4, 0), carry);
7651 shrq(carry, 32);
7652 subl(kdx, 1);
7653
7654 bind(L_carry);
7655 movl(Address(z, kdx, Address::times_4, 0), carry);
7656
7657 // Second and third (nested) loops.
7658 //
7659 // for (int i = xstart-1; i >= 0; i--) { // Second loop
7660 // carry = 0;
7661 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
7662 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
7663 // (z[k] & LONG_MASK) + carry;
7664 // z[k] = (int)product;
7665 // carry = product >>> 32;
7666 // }
7667 // z[i] = (int)carry;
7668 // }
7669 //
7670 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
7671
7672 const Register jdx = tmp1;
7673
7674 bind(L_second_loop);
7675 xorl(carry, carry); // carry = 0;
7676 movl(jdx, ylen); // j = ystart+1
7677
7678 subl(xstart, 1); // i = xstart-1;
7679 jcc(Assembler::negative, L_done);
7680
7681 push (z);
7682
7683 Label L_last_x;
7684 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
7685 subl(xstart, 1); // i = xstart-1;
7686 jcc(Assembler::negative, L_last_x);
7687
7688 if (UseBMI2Instructions) {
7689 movq(rdx, Address(x, xstart, Address::times_4, 0));
7690 rorxq(rdx, rdx, 32); // convert big-endian to little-endian
7691 } else {
7692 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
7693 rorq(x_xstart, 32); // convert big-endian to little-endian
7694 }
7695
7696 Label L_third_loop_prologue;
7697 bind(L_third_loop_prologue);
7698
7699 push (x);
7700 push (xstart);
7701 push (ylen);
7702
7703
7704 if (UseBMI2Instructions) {
7705 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
7706 } else { // !UseBMI2Instructions
7707 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
7708 }
7709
7710 pop(ylen);
7711 pop(xlen);
7712 pop(x);
7713 pop(z);
7714
7715 movl(tmp3, xlen);
7716 addl(tmp3, 1);
7717 movl(Address(z, tmp3, Address::times_4, 0), carry);
7718 subl(tmp3, 1);
7719 jccb(Assembler::negative, L_done);
7720
7721 shrq(carry, 32);
7722 movl(Address(z, tmp3, Address::times_4, 0), carry);
7723 jmp(L_second_loop);
7724
7725 // Next infrequent code is moved outside loops.
7726 bind(L_last_x);
7727 if (UseBMI2Instructions) {
7728 movl(rdx, Address(x, 0));
7729 } else {
7730 movl(x_xstart, Address(x, 0));
7731 }
7732 jmp(L_third_loop_prologue);
7733
7734 bind(L_done);
7735
7736 pop(xlen);
7737
7738 pop(tmp5);
7739 pop(tmp4);
7740 pop(tmp3);
7741 pop(tmp2);
7742 pop(tmp1);
7743 pop(tmp0);
7744 }
7745
// Find the index of the first mismatching element between two arrays, or
// -1 if they are equal over 'length' elements. 'length' arrives as an
// element count and is converted to a byte count; 'result' accumulates a
// byte offset during the scan and is converted back to an element index
// (shift right by the scale in rcx) before returning.
void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
                                         Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
  Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
  Label VECTOR8_TAIL, VECTOR4_TAIL;
  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
  Label SAME_TILL_END, DONE;
  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;

  //scale is in rcx in both Win64 and Unix
  ShortBranchVerifier sbv(this);

  shlq(length);          // length <<= scale (cl): element count -> byte count
  xorq(result, result);

  if ((AVX3Threshold == 0) && (UseAVX > 2) &&
      VM_Version::supports_avx512vlbw()) {
    Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;

    cmpq(length, 64);
    jcc(Assembler::less, VECTOR32_TAIL);

    movq(tmp1, length);
    andq(tmp1, 0x3F);      // tail count
    andq(length, ~(0x3F)); //vector count

    bind(VECTOR64_LOOP);
    // AVX512 code to compare 64 byte vectors.
    evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
    evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
    kortestql(k7, k7);
    // kortest sets CF only when the mask is all ones (all 64 bytes equal);
    // aboveEqual (CF == 0) therefore means a mismatch.
    jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
    addq(result, 64);
    subq(length, 64);
    jccb(Assembler::notZero, VECTOR64_LOOP);

    //bind(VECTOR64_TAIL);
    testq(tmp1, tmp1);
    jcc(Assembler::zero, SAME_TILL_END);

    //bind(VECTOR64_TAIL);
    // AVX512 code to compare up to 63 byte vectors.
    // Build a mask with the low 'tail count' bits set and do a masked compare.
    mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp2, tmp2, tmp1);
    notq(tmp2);
    kmovql(k3, tmp2);

    evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
    evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);

    ktestql(k7, k3);
    jcc(Assembler::below, SAME_TILL_END);     // not mismatch

    bind(VECTOR64_NOT_EQUAL);
    // First zero bit of the equality mask is the byte offset of the mismatch.
    kmovql(tmp1, k7);
    notq(tmp1);
    tzcntq(tmp1, tmp1);
    addq(result, tmp1);
    shrq(result);          // byte offset -> element index (shift by cl = scale)
    jmp(DONE);
    bind(VECTOR32_TAIL);
  }

  cmpq(length, 8);
  jcc(Assembler::equal, VECTOR8_LOOP);
  jcc(Assembler::less, VECTOR4_TAIL);

  if (UseAVX >= 2) {
    Label VECTOR16_TAIL, VECTOR32_LOOP;

    cmpq(length, 16);
    jcc(Assembler::equal, VECTOR16_LOOP);
    jcc(Assembler::less, VECTOR8_LOOP);

    cmpq(length, 32);
    jccb(Assembler::less, VECTOR16_TAIL);

    subq(length, 32);
    bind(VECTOR32_LOOP);
    // XOR the 32-byte chunks; a non-zero result means a mismatch.
    vmovdqu(rymm0, Address(obja, result));
    vmovdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
    vptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
    addq(result, 32);
    subq(length, 32);
    jcc(Assembler::greaterEqual, VECTOR32_LOOP);
    addq(length, 32);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 32 bytes left //close the branch here.

    bind(VECTOR16_TAIL);
    cmpq(length, 16);
    jccb(Assembler::less, VECTOR8_TAIL);
    bind(VECTOR16_LOOP);
    movdqu(rymm0, Address(obja, result));
    movdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
    ptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
    addq(result, 16);
    subq(length, 16);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 16 bytes left
  } else {//regular intrinsics

    cmpq(length, 16);
    jccb(Assembler::less, VECTOR8_TAIL);

    subq(length, 16);
    bind(VECTOR16_LOOP);
    movdqu(rymm0, Address(obja, result));
    movdqu(rymm1, Address(objb, result));
    pxor(rymm0, rymm1);
    ptest(rymm0, rymm0);
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
    addq(result, 16);
    subq(length, 16);
    jccb(Assembler::greaterEqual, VECTOR16_LOOP);
    addq(length, 16);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 16 bytes left
  }

  // 8-byte chunks compared through general-purpose registers.
  bind(VECTOR8_TAIL);
  cmpq(length, 8);
  jccb(Assembler::less, VECTOR4_TAIL);
  bind(VECTOR8_LOOP);
  movq(tmp1, Address(obja, result));
  movq(tmp2, Address(objb, result));
  xorq(tmp1, tmp2);
  testq(tmp1, tmp1);
  jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
  addq(result, 8);
  subq(length, 8);
  jcc(Assembler::equal, SAME_TILL_END);
  //falling through if less than 8 bytes left

  // 4-byte chunk.
  bind(VECTOR4_TAIL);
  cmpq(length, 4);
  jccb(Assembler::less, BYTES_TAIL);
  bind(VECTOR4_LOOP);
  movl(tmp1, Address(obja, result));
  xorl(tmp1, Address(objb, result));
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
  addq(result, 4);
  subq(length, 4);
  jcc(Assembler::equal, SAME_TILL_END);
  //falling through if less than 4 bytes left

  // Up to 3 trailing bytes, compared one at a time (unrolled).
  bind(BYTES_TAIL);
  bind(BYTES_LOOP);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  decq(length);
  jcc(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  decq(length);
  jcc(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  jmp(SAME_TILL_END);

  // Mismatch landing pads: compute the exact byte offset of the first
  // differing byte, then convert it to an element index.
  if (UseAVX >= 2) {
    bind(VECTOR32_NOT_EQUAL);
    // Invert the byte-equality mask and locate the first differing byte.
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
    vpmovmskb(tmp1, rymm0);
    bsfq(tmp1, tmp1);
    addq(result, tmp1);
    shrq(result);          // byte offset -> element index (shift by cl)
    jmp(DONE);
  }

  bind(VECTOR16_NOT_EQUAL);
  if (UseAVX >= 2) {
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
    pxor(rymm0, rymm2);
  } else {
    pcmpeqb(rymm2, rymm2);
    pxor(rymm0, rymm1);
    pcmpeqb(rymm0, rymm1);
    pxor(rymm0, rymm2);
  }
  pmovmskb(tmp1, rymm0);
  bsfq(tmp1, tmp1);
  addq(result, tmp1);
  shrq(result);            // byte offset -> element index (shift by cl)
  jmpb(DONE);

  bind(VECTOR8_NOT_EQUAL);
  bind(VECTOR4_NOT_EQUAL);
  // tmp1 holds the XOR of the chunks: first set bit / 8 = differing byte.
  bsfq(tmp1, tmp1);
  shrq(tmp1, 3);
  addq(result, tmp1);
  bind(BYTES_NOT_EQUAL);
  shrq(result);            // byte offset -> element index (shift by cl)
  jmpb(DONE);

  bind(SAME_TILL_END);
  mov64(result, -1);       // arrays are equal over the whole range

  bind(DONE);
}
7965
// Helper functions for square_to_len()
7967
7968 /**
7969 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
7970 * Preserves x and z and modifies rest of the registers.
7971 */
7972 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7973 // Perform square and right shift by 1
7974 // Handle odd xlen case first, then for even xlen do the following
7975 // jlong carry = 0;
7976 // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
7977 // huge_128 product = x[j:j+1] * x[j:j+1];
7978 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
7979 // z[i+2:i+3] = (jlong)(product >>> 1);
7980 // carry = (jlong)product;
7981 // }
7982
7983 xorq(tmp5, tmp5); // carry
7984 xorq(rdxReg, rdxReg);
7985 xorl(tmp1, tmp1); // index for x
7986 xorl(tmp4, tmp4); // index for z
7987
7988 Label L_first_loop, L_first_loop_exit;
7989
7990 testl(xlen, 1);
7991 jccb(Assembler::zero, L_first_loop); //jump if xlen is even
7992
7993 // Square and right shift by 1 the odd element using 32 bit multiply
7994 movl(raxReg, Address(x, tmp1, Address::times_4, 0));
7995 imulq(raxReg, raxReg);
7996 shrq(raxReg, 1);
7997 adcq(tmp5, 0);
7998 movq(Address(z, tmp4, Address::times_4, 0), raxReg);
7999 incrementl(tmp1);
8000 addl(tmp4, 2);
8001
8002 // Square and right shift by 1 the rest using 64 bit multiply
8003 bind(L_first_loop);
8004 cmpptr(tmp1, xlen);
8005 jccb(Assembler::equal, L_first_loop_exit);
8006
8007 // Square
8008 movq(raxReg, Address(x, tmp1, Address::times_4, 0));
8009 rorq(raxReg, 32); // convert big-endian to little-endian
8010 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax
8011
8012 // Right shift by 1 and save carry
8013 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
8014 rcrq(rdxReg, 1);
8015 rcrq(raxReg, 1);
8016 adcq(tmp5, 0);
8017
8018 // Store result in z
8019 movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
8020 movq(Address(z, tmp4, Address::times_4, 8), raxReg);
8021
8022 // Update indices for x and z
8023 addl(tmp1, 2);
8024 addl(tmp4, 4);
8025 jmp(L_first_loop);
8026
8027 bind(L_first_loop_exit);
8028 }
8029
8030
8031 /**
8032 * Perform the following multiply add operation using BMI2 instructions
8033 * carry:sum = sum + op1*op2 + carry
8034 * op2 should be in rdx
8035 * op2 is preserved, all other registers are modified
8036 */
8037 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
8038 // assert op2 is rdx
8039 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1
8040 addq(sum, carry);
8041 adcq(tmp2, 0);
8042 addq(sum, op1);
8043 adcq(tmp2, 0);
8044 movq(carry, tmp2);
8045 }
8046
8047 /**
8048 * Perform the following multiply add operation:
8049 * carry:sum = sum + op1*op2 + carry
8050 * Preserves op1, op2 and modifies rest of registers
8051 */
8052 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
8053 // rdx:rax = op1 * op2
8054 movq(raxReg, op2);
8055 mulq(op1);
8056
8057 // rdx:rax = sum + carry + rdx:rax
8058 addq(sum, carry);
8059 adcq(rdxReg, 0);
8060 addq(sum, raxReg);
8061 adcq(rdxReg, 0);
8062
8063 // carry:sum = rdx:sum
8064 movq(carry, rdxReg);
8065 }
8066
8067 /**
8068 * Add 64 bit long carry into z[] with carry propagation.
8069 * Preserves z and carry register values and modifies rest of registers.
8070 *
8071 */
8072 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
8073 Label L_fourth_loop, L_fourth_loop_exit;
8074
8075 movl(tmp1, 1);
8076 subl(zlen, 2);
8077 addq(Address(z, zlen, Address::times_4, 0), carry);
8078
8079 bind(L_fourth_loop);
8080 jccb(Assembler::carryClear, L_fourth_loop_exit);
8081 subl(zlen, 2);
8082 jccb(Assembler::negative, L_fourth_loop_exit);
8083 addq(Address(z, zlen, Address::times_4, 0), tmp1);
8084 jmp(L_fourth_loop);
8085 bind(L_fourth_loop_exit);
8086 }
8087
8088 /**
8089 * Shift z[] left by 1 bit.
8090 * Preserves x, len, z and zlen registers and modifies rest of the registers.
8091 *
8092 */
8093 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
8094
8095 Label L_fifth_loop, L_fifth_loop_exit;
8096
8097 // Fifth loop
8098 // Perform primitiveLeftShift(z, zlen, 1)
8099
8100 const Register prev_carry = tmp1;
8101 const Register new_carry = tmp4;
8102 const Register value = tmp2;
8103 const Register zidx = tmp3;
8104
8105 // int zidx, carry;
8106 // long value;
8107 // carry = 0;
8108 // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
8109 // (carry:value) = (z[i] << 1) | carry ;
8110 // z[i] = value;
8111 // }
8112
8113 movl(zidx, zlen);
8114 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
8115
8116 bind(L_fifth_loop);
8117 decl(zidx); // Use decl to preserve carry flag
8118 decl(zidx);
8119 jccb(Assembler::negative, L_fifth_loop_exit);
8120
8121 if (UseBMI2Instructions) {
8122 movq(value, Address(z, zidx, Address::times_4, 0));
8123 rclq(value, 1);
8124 rorxq(value, value, 32);
8125 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
8126 }
8127 else {
8128 // clear new_carry
8129 xorl(new_carry, new_carry);
8130
8131 // Shift z[i] by 1, or in previous carry and save new carry
8132 movq(value, Address(z, zidx, Address::times_4, 0));
8133 shlq(value, 1);
8134 adcl(new_carry, 0);
8135
8136 orq(value, prev_carry);
8137 rorq(value, 0x20);
8138 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
8139
8140 // Set previous carry = new carry
8141 movl(prev_carry, new_carry);
8142 }
8143 jmp(L_fifth_loop);
8144
8145 bind(L_fifth_loop_exit);
8146 }
8147
8148
8149 /**
8150 * Code for BigInteger::squareToLen() intrinsic
8151 *
8152 * rdi: x
8153 * rsi: len
8154 * r8: z
8155 * rcx: zlen
8156 * r12: tmp1
8157 * r13: tmp2
8158 * r14: tmp3
8159 * r15: tmp4
8160 * rbx: tmp5
8161 *
8162 */
8163 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8164
8165 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
8166 push(tmp1);
8167 push(tmp2);
8168 push(tmp3);
8169 push(tmp4);
8170 push(tmp5);
8171
8172 // First loop
8173 // Store the squares, right shifted one bit (i.e., divided by 2).
8174 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
8175
8176 // Add in off-diagonal sums.
8177 //
8178 // Second, third (nested) and fourth loops.
8179 // zlen +=2;
8180 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
8181 // carry = 0;
8182 // long op2 = x[xidx:xidx+1];
8183 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
8184 // k -= 2;
8185 // long op1 = x[j:j+1];
8186 // long sum = z[k:k+1];
8187 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
8188 // z[k:k+1] = sum;
8189 // }
8190 // add_one_64(z, k, carry, tmp_regs);
8191 // }
8192
8193 const Register carry = tmp5;
8194 const Register sum = tmp3;
8195 const Register op1 = tmp4;
8196 Register op2 = tmp2;
8197
8198 push(zlen);
8199 push(len);
8200 addl(zlen,2);
8201 bind(L_second_loop);
8202 xorq(carry, carry);
8203 subl(zlen, 4);
8204 subl(len, 2);
8205 push(zlen);
8206 push(len);
8207 cmpl(len, 0);
8208 jccb(Assembler::lessEqual, L_second_loop_exit);
8209
8210 // Multiply an array by one 64 bit long.
8211 if (UseBMI2Instructions) {
8212 op2 = rdxReg;
8213 movq(op2, Address(x, len, Address::times_4, 0));
8214 rorxq(op2, op2, 32);
8215 }
8216 else {
8217 movq(op2, Address(x, len, Address::times_4, 0));
8218 rorq(op2, 32);
8219 }
8220
8221 bind(L_third_loop);
8222 decrementl(len);
8223 jccb(Assembler::negative, L_third_loop_exit);
8224 decrementl(len);
8225 jccb(Assembler::negative, L_last_x);
8226
8227 movq(op1, Address(x, len, Address::times_4, 0));
8228 rorq(op1, 32);
8229
8230 bind(L_multiply);
8231 subl(zlen, 2);
8232 movq(sum, Address(z, zlen, Address::times_4, 0));
8233
8234 // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
8235 if (UseBMI2Instructions) {
8236 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
8237 }
8238 else {
8239 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8240 }
8241
8242 movq(Address(z, zlen, Address::times_4, 0), sum);
8243
8244 jmp(L_third_loop);
8245 bind(L_third_loop_exit);
8246
8247 // Fourth loop
8248 // Add 64 bit long carry into z with carry propagation.
8249 // Uses offsetted zlen.
8250 add_one_64(z, zlen, carry, tmp1);
8251
8252 pop(len);
8253 pop(zlen);
8254 jmp(L_second_loop);
8255
8256 // Next infrequent code is moved outside loops.
8257 bind(L_last_x);
8258 movl(op1, Address(x, 0));
8259 jmp(L_multiply);
8260
8261 bind(L_second_loop_exit);
8262 pop(len);
8263 pop(zlen);
8264 pop(len);
8265 pop(zlen);
8266
8267 // Fifth loop
8268 // Shift z left 1 bit.
8269 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
8270
8271 // z[zlen-1] |= x[len-1] & 1;
8272 movl(tmp3, Address(x, len, Address::times_4, -4));
8273 andl(tmp3, 1);
8274 orl(Address(z, zlen, Address::times_4, -4), tmp3);
8275
8276 pop(tmp5);
8277 pop(tmp4);
8278 pop(tmp3);
8279 pop(tmp2);
8280 pop(tmp1);
8281 }
8282
8283 /**
8284 * Helper function for mul_add()
8285 * Multiply the in[] by int k and add to out[] starting at offset offs using
8286 * 128 bit by 32 bit multiply and return the carry in tmp5.
8287 * Only quad int aligned length of in[] is operated on in this function.
8288 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
8289 * This function preserves out, in and k registers.
8290 * len and offset point to the appropriate index in "in" & "out" correspondingly
8291 * tmp5 has the carry.
8292 * other registers are temporary and are modified.
8293 *
8294 */
8295 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
8296 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
8297 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8298
8299 Label L_first_loop, L_first_loop_exit;
8300
8301 movl(tmp1, len);
8302 shrl(tmp1, 2);
8303
8304 bind(L_first_loop);
8305 subl(tmp1, 1);
8306 jccb(Assembler::negative, L_first_loop_exit);
8307
8308 subl(len, 4);
8309 subl(offset, 4);
8310
8311 Register op2 = tmp2;
8312 const Register sum = tmp3;
8313 const Register op1 = tmp4;
8314 const Register carry = tmp5;
8315
8316 if (UseBMI2Instructions) {
8317 op2 = rdxReg;
8318 }
8319
8320 movq(op1, Address(in, len, Address::times_4, 8));
8321 rorq(op1, 32);
8322 movq(sum, Address(out, offset, Address::times_4, 8));
8323 rorq(sum, 32);
8324 if (UseBMI2Instructions) {
8325 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8326 }
8327 else {
8328 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8329 }
8330 // Store back in big endian from little endian
8331 rorq(sum, 0x20);
8332 movq(Address(out, offset, Address::times_4, 8), sum);
8333
8334 movq(op1, Address(in, len, Address::times_4, 0));
8335 rorq(op1, 32);
8336 movq(sum, Address(out, offset, Address::times_4, 0));
8337 rorq(sum, 32);
8338 if (UseBMI2Instructions) {
8339 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8340 }
8341 else {
8342 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8343 }
8344 // Store back in big endian from little endian
8345 rorq(sum, 0x20);
8346 movq(Address(out, offset, Address::times_4, 0), sum);
8347
8348 jmp(L_first_loop);
8349 bind(L_first_loop_exit);
8350 }
8351
8352 /**
8353 * Code for BigInteger::mulAdd() intrinsic
8354 *
8355 * rdi: out
8356 * rsi: in
8357 * r11: offs (out.length - offset)
8358 * rcx: len
8359 * r8: k
8360 * r12: tmp1
8361 * r13: tmp2
8362 * r14: tmp3
8363 * r15: tmp4
8364 * rbx: tmp5
8365 * Multiply the in[] by word k and add to out[], return the carry in rax
8366 */
8367 void MacroAssembler::mul_add(Register out, Register in, Register offs,
8368 Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
8369 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8370
8371 Label L_carry, L_last_in, L_done;
8372
8373 // carry = 0;
8374 // for (int j=len-1; j >= 0; j--) {
8375 // long product = (in[j] & LONG_MASK) * kLong +
8376 // (out[offs] & LONG_MASK) + carry;
8377 // out[offs--] = (int)product;
8378 // carry = product >>> 32;
8379 // }
8380 //
8381 push(tmp1);
8382 push(tmp2);
8383 push(tmp3);
8384 push(tmp4);
8385 push(tmp5);
8386
8387 Register op2 = tmp2;
8388 const Register sum = tmp3;
8389 const Register op1 = tmp4;
8390 const Register carry = tmp5;
8391
8392 if (UseBMI2Instructions) {
8393 op2 = rdxReg;
8394 movl(op2, k);
8395 }
8396 else {
8397 movl(op2, k);
8398 }
8399
8400 xorq(carry, carry);
8401
8402 //First loop
8403
8404 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
8405 //The carry is in tmp5
8406 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
8407
8408 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
8409 decrementl(len);
8410 jccb(Assembler::negative, L_carry);
8411 decrementl(len);
8412 jccb(Assembler::negative, L_last_in);
8413
8414 movq(op1, Address(in, len, Address::times_4, 0));
8415 rorq(op1, 32);
8416
8417 subl(offs, 2);
8418 movq(sum, Address(out, offs, Address::times_4, 0));
8419 rorq(sum, 32);
8420
8421 if (UseBMI2Instructions) {
8422 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8423 }
8424 else {
8425 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8426 }
8427
8428 // Store back in big endian from little endian
8429 rorq(sum, 0x20);
8430 movq(Address(out, offs, Address::times_4, 0), sum);
8431
8432 testl(len, len);
8433 jccb(Assembler::zero, L_carry);
8434
8435 //Multiply the last in[] entry, if any
8436 bind(L_last_in);
8437 movl(op1, Address(in, 0));
8438 movl(sum, Address(out, offs, Address::times_4, -4));
8439
8440 movl(raxReg, k);
8441 mull(op1); //tmp4 * eax -> edx:eax
8442 addl(sum, carry);
8443 adcl(rdxReg, 0);
8444 addl(sum, raxReg);
8445 adcl(rdxReg, 0);
8446 movl(carry, rdxReg);
8447
8448 movl(Address(out, offs, Address::times_4, -4), sum);
8449
8450 bind(L_carry);
8451 //return tmp5/carry as carry in rax
8452 movl(rax, carry);
8453
8454 bind(L_done);
8455 pop(tmp5);
8456 pop(tmp4);
8457 pop(tmp3);
8458 pop(tmp2);
8459 pop(tmp1);
8460 }
8461
8462 /**
8463 * Emits code to update CRC-32 with a byte value according to constants in table
8464 *
8465 * @param [in,out]crc Register containing the crc.
8466 * @param [in]val Register containing the byte to fold into the CRC.
8467 * @param [in]table Register containing the table of crc constants.
8468 *
8469 * uint32_t crc;
8470 * val = crc_table[(val ^ crc) & 0xFF];
8471 * crc = val ^ (crc >> 8);
8472 *
8473 */
8474 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
8475 xorl(val, crc);
8476 andl(val, 0xFF);
8477 shrl(crc, 8); // unsigned shift
8478 xorl(crc, Address(table, val, Address::times_4, 0));
8479 }
8480
8481 /**
8482 * Fold 128-bit data chunk
8483 */
8484 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
8485 if (UseAVX > 0) {
8486 vpclmulhdq(xtmp, xK, xcrc); // [123:64]
8487 vpclmulldq(xcrc, xK, xcrc); // [63:0]
8488 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
8489 pxor(xcrc, xtmp);
8490 } else {
8491 movdqa(xtmp, xcrc);
8492 pclmulhdq(xtmp, xK); // [123:64]
8493 pclmulldq(xcrc, xK); // [63:0]
8494 pxor(xcrc, xtmp);
8495 movdqu(xtmp, Address(buf, offset));
8496 pxor(xcrc, xtmp);
8497 }
8498 }
8499
// Fold 128-bit data chunk, register variant: like the Address overload above
// but the 16 input bytes are already in xbuf instead of memory.
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
  if (UseAVX > 0) {
    vpclmulhdq(xtmp, xK, xcrc);
    vpclmulldq(xcrc, xK, xcrc);
    pxor(xcrc, xbuf);
    pxor(xcrc, xtmp);
  } else {
    // Non-AVX: pclmul is destructive, so copy the accumulator first.
    movdqa(xtmp, xcrc);
    pclmulhdq(xtmp, xK);
    pclmulldq(xcrc, xK);
    pxor(xcrc, xbuf);
    pxor(xcrc, xtmp);
  }
}
8514
8515 /**
8516 * 8-bit folds to compute 32-bit CRC
8517 *
8518 * uint64_t xcrc;
8519 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
8520 */
8521 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
8522 movdl(tmp, xcrc);
8523 andl(tmp, 0xFF);
8524 movdl(xtmp, Address(table, tmp, Address::times_4, 0));
8525 psrldq(xcrc, 1); // unsigned shift one byte
8526 pxor(xcrc, xtmp);
8527 }
8528
8529 /**
8530 * uint32_t crc;
8531 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
8532 */
8533 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
8534 movl(tmp, crc);
8535 andl(tmp, 0xFF);
8536 shrl(crc, 8);
8537 xorl(crc, Address(table, tmp, Address::times_4, 0));
8538 }
8539
8540 /**
8541 * @param crc register containing existing CRC (32-bit)
8542 * @param buf register pointing to input byte buffer (byte*)
8543 * @param len register containing number of bytes
8544 * @param table register that will contain address of CRC table
8545 * @param tmp scratch register
8546 */
8547 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
8548 assert_different_registers(crc, buf, len, table, tmp, rax);
8549
8550 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8551 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8552
8553 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
8554 // context for the registers used, where all instructions below are using 128-bit mode
8555 // On EVEX without VL and BW, these instructions will all be AVX.
8556 lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
8557 notl(crc); // ~crc
8558 cmpl(len, 16);
8559 jcc(Assembler::less, L_tail);
8560
8561 // Align buffer to 16 bytes
8562 movl(tmp, buf);
8563 andl(tmp, 0xF);
8564 jccb(Assembler::zero, L_aligned);
8565 subl(tmp, 16);
8566 addl(len, tmp);
8567
8568 align(4);
8569 BIND(L_align_loop);
8570 movsbl(rax, Address(buf, 0)); // load byte with sign extension
8571 update_byte_crc32(crc, rax, table);
8572 increment(buf);
8573 incrementl(tmp);
8574 jccb(Assembler::less, L_align_loop);
8575
8576 BIND(L_aligned);
8577 movl(tmp, len); // save
8578 shrl(len, 4);
8579 jcc(Assembler::zero, L_tail_restore);
8580
8581 // Fold crc into first bytes of vector
8582 movdqa(xmm1, Address(buf, 0));
8583 movdl(rax, xmm1);
8584 xorl(crc, rax);
8585 if (VM_Version::supports_sse4_1()) {
8586 pinsrd(xmm1, crc, 0);
8587 } else {
8588 pinsrw(xmm1, crc, 0);
8589 shrl(crc, 16);
8590 pinsrw(xmm1, crc, 1);
8591 }
8592 addptr(buf, 16);
8593 subl(len, 4); // len > 0
8594 jcc(Assembler::less, L_fold_tail);
8595
8596 movdqa(xmm2, Address(buf, 0));
8597 movdqa(xmm3, Address(buf, 16));
8598 movdqa(xmm4, Address(buf, 32));
8599 addptr(buf, 48);
8600 subl(len, 3);
8601 jcc(Assembler::lessEqual, L_fold_512b);
8602
8603 // Fold total 512 bits of polynomial on each iteration,
8604 // 128 bits per each of 4 parallel streams.
8605 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32), rscratch1);
8606
8607 align32();
8608 BIND(L_fold_512b_loop);
8609 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
8610 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
8611 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
8612 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
8613 addptr(buf, 64);
8614 subl(len, 4);
8615 jcc(Assembler::greater, L_fold_512b_loop);
8616
8617 // Fold 512 bits to 128 bits.
8618 BIND(L_fold_512b);
8619 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
8620 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
8621 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
8622 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
8623
8624 // Fold the rest of 128 bits data chunks
8625 BIND(L_fold_tail);
8626 addl(len, 3);
8627 jccb(Assembler::lessEqual, L_fold_128b);
8628 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
8629
8630 BIND(L_fold_tail_loop);
8631 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
8632 addptr(buf, 16);
8633 decrementl(len);
8634 jccb(Assembler::greater, L_fold_tail_loop);
8635
8636 // Fold 128 bits in xmm1 down into 32 bits in crc register.
8637 BIND(L_fold_128b);
8638 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()), rscratch1);
8639 if (UseAVX > 0) {
8640 vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
8641 vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
8642 vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
8643 } else {
8644 movdqa(xmm2, xmm0);
8645 pclmulqdq(xmm2, xmm1, 0x1);
8646 movdqa(xmm3, xmm0);
8647 pand(xmm3, xmm2);
8648 pclmulqdq(xmm0, xmm3, 0x1);
8649 }
8650 psrldq(xmm1, 8);
8651 psrldq(xmm2, 4);
8652 pxor(xmm0, xmm1);
8653 pxor(xmm0, xmm2);
8654
8655 // 8 8-bit folds to compute 32-bit CRC.
8656 for (int j = 0; j < 4; j++) {
8657 fold_8bit_crc32(xmm0, table, xmm1, rax);
8658 }
8659 movdl(crc, xmm0); // mov 32 bits to general register
8660 for (int j = 0; j < 4; j++) {
8661 fold_8bit_crc32(crc, table, rax);
8662 }
8663
8664 BIND(L_tail_restore);
8665 movl(len, tmp); // restore
8666 BIND(L_tail);
8667 andl(len, 0xf);
8668 jccb(Assembler::zero, L_exit);
8669
8670 // Fold the rest of bytes
8671 align(4);
8672 BIND(L_tail_loop);
8673 movsbl(rax, Address(buf, 0)); // load byte with sign extension
8674 update_byte_crc32(crc, rax, table);
8675 increment(buf);
8676 decrementl(len);
8677 jccb(Assembler::greater, L_tail_loop);
8678
8679 BIND(L_exit);
8680 notl(crc); // ~c
8681 }
8682
// Helper function for AVX 512 CRC32
// Fold 512-bit data chunks: carry-less multiply the accumulator xcrc by the
// folding constants xK and xor in the next 64 input bytes at buf+pos+offset.
// Note: clobbers xmm2 and xmm3 in addition to xtmp.
void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
                                             Register pos, int offset) {
  evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);  // next 64 bytes of input
  evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
  evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
  evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
  evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
}
8693
// Helper function for AVX 512 CRC32
// Compute CRC32 for < 256B buffers. Jumps back into the labels supplied by
// kernel_crc32_avx512() for the shared reduction/finalization paths; data
// shorter than 16 bytes is staged through a zeroed 16-byte stack slot and
// aligned with a pshufb using the shuffle-constant table.
void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
                                              Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
                                              Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {

  Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
  Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
  Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;

  // check if there is enough buffer to be able to fold 16B at a time
  cmpl(len, 32);
  jcc(Assembler::less, L_less_than_32);

  // if there is, load the constants
  movdqu(xmm10, Address(table, 1 * 16));    //rk1 and rk2 in xmm10
  movdl(xmm0, crc);                         // get the initial crc value
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
  pxor(xmm7, xmm0);

  // update the buffer pointer
  addl(pos, 16);
  //update the counter.subtract 32 instead of 16 to save one instruction from the loop
  subl(len, 32);
  jmp(L_16B_reduction_loop);

  bind(L_less_than_32);
  //mov initial crc to the return value. this is necessary for zero - length buffers.
  movl(rax, crc);
  testl(len, len);
  jcc(Assembler::equal, L_cleanup);

  movdl(xmm0, crc);                        //get the initial crc value

  cmpl(len, 16);
  jcc(Assembler::equal, L_exact_16_left);
  jcc(Assembler::less, L_less_than_16_left);

  // 17..31 bytes: fold one 16B block, then finish in L_get_last_two_xmms
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
  pxor(xmm7, xmm0);                        //xor the initial crc value
  addl(pos, 16);
  subl(len, 16);
  movdqu(xmm10, Address(table, 1 * 16));    // rk1 and rk2 in xmm10
  jmp(L_get_last_two_xmms);

  bind(L_less_than_16_left);
  //use stack space to load data less than 16 bytes, zero - out the 16B in memory first.
  pxor(xmm1, xmm1);
  movptr(tmp1, rsp);
  movdqu(Address(tmp1, 0 * 16), xmm1);

  cmpl(len, 4);
  jcc(Assembler::less, L_only_less_than_4);

  //backup the counter value
  movl(tmp2, len);
  cmpl(len, 8);
  jcc(Assembler::less, L_less_than_8_left);

  //load 8 Bytes
  movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
  movq(Address(tmp1, 0 * 16), rax);
  addptr(tmp1, 8);
  subl(len, 8);
  addl(pos, 8);

  bind(L_less_than_8_left);
  cmpl(len, 4);
  jcc(Assembler::less, L_less_than_4_left);

  //load 4 Bytes
  movl(rax, Address(buf, pos, Address::times_1, 0));
  movl(Address(tmp1, 0 * 16), rax);
  addptr(tmp1, 4);
  subl(len, 4);
  addl(pos, 4);

  bind(L_less_than_2_left);
  cmpl(len, 1);
  jcc(Assembler::less, L_zero_left);

  // load 1 Byte
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0 * 16), rax);

  bind(L_zero_left);
  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0); //xor the initial crc value

  // shift the staged bytes into position using the shuffle table indexed by
  // the original byte count (tmp2)
  lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
  movdqu(xmm0, Address(rax, tmp2));
  pshufb(xmm7, xmm0);
  jmp(L_128_done);

  bind(L_exact_16_left);
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
  pxor(xmm7, xmm0); //xor the initial crc value
  jmp(L_128_done);

  bind(L_only_less_than_4);
  cmpl(len, 3);
  jcc(Assembler::less, L_only_less_than_3);

  // 1-3 byte inputs are too short for the 64b/32b folds; left-shift into
  // place and go straight to the Barrett reduction.
  // load 3 Bytes
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movb(rax, Address(buf, pos, Address::times_1, 1));
  movb(Address(tmp1, 1), rax);

  movb(rax, Address(buf, pos, Address::times_1, 2));
  movb(Address(tmp1, 2), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0); //xor the initial crc value

  pslldq(xmm7, 0x5);
  jmp(L_barrett);
  bind(L_only_less_than_3);
  cmpl(len, 2);
  jcc(Assembler::less, L_only_less_than_2);

  // load 2 Bytes
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movb(rax, Address(buf, pos, Address::times_1, 1));
  movb(Address(tmp1, 1), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0); //xor the initial crc value

  pslldq(xmm7, 0x6);
  jmp(L_barrett);

  bind(L_only_less_than_2);
  //load 1 Byte
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0); //xor the initial crc value

  pslldq(xmm7, 0x7);
}
8850
8851 /**
8852 * Compute CRC32 using AVX512 instructions
8853 * param crc register containing existing CRC (32-bit)
8854 * param buf register pointing to input byte buffer (byte*)
8855 * param len register containing number of bytes
8856 * param table address of crc or crc32c table
8857 * param tmp1 scratch register
8858 * param tmp2 scratch register
8859 * return rax result register
8860 *
8861 * This routine is identical for crc32c with the exception of the precomputed constant
8862 * table which will be passed as the table argument. The calculation steps are
8863 * the same for both variants.
8864 */
8865 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
8866 assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
8867
8868 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8869 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8870 Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
8871 Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
8872 Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
8873
8874 const Register pos = r12;
8875 push(r12);
8876 subptr(rsp, 16 * 2 + 8);
8877
8878 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
8879 // context for the registers used, where all instructions below are using 128-bit mode
8880 // On EVEX without VL and BW, these instructions will all be AVX.
8881 movl(pos, 0);
8882
8883 // check if smaller than 256B
8884 cmpl(len, 256);
8885 jcc(Assembler::less, L_less_than_256);
8886
8887 // load the initial crc value
8888 movdl(xmm10, crc);
8889
8890 // receive the initial 64B data, xor the initial crc value
8891 evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
8892 evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
8893 evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
8894 evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
8895
8896 subl(len, 256);
8897 cmpl(len, 256);
8898 jcc(Assembler::less, L_fold_128_B_loop);
8899
8900 evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
8901 evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
8902 evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
8903 subl(len, 256);
8904
8905 bind(L_fold_256_B_loop);
8906 addl(pos, 256);
8907 fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
8908 fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
8909 fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
8910 fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
8911
8912 subl(len, 256);
8913 jcc(Assembler::greaterEqual, L_fold_256_B_loop);
8914
8915 // Fold 256 into 128
8916 addl(pos, 256);
8917 evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
8918 evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
8919 vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
8920
8921 evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
8922 evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
8923 vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
8924
8925 evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
8926 evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
8927
8928 addl(len, 128);
8929 jmp(L_fold_128_B_register);
8930
8931 // at this section of the code, there is 128 * x + y(0 <= y<128) bytes of buffer.The fold_128_B_loop
8932 // loop will fold 128B at a time until we have 128 + y Bytes of buffer
8933
8934 // fold 128B at a time.This section of the code folds 8 xmm registers in parallel
8935 bind(L_fold_128_B_loop);
8936 addl(pos, 128);
8937 fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
8938 fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
8939
8940 subl(len, 128);
8941 jcc(Assembler::greaterEqual, L_fold_128_B_loop);
8942
8943 addl(pos, 128);
8944
8945 // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
8946 // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
8947 bind(L_fold_128_B_register);
8948 evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
8949 evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
8950 evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
8951 evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
8952 // save last that has no multiplicand
8953 vextracti64x2(xmm7, xmm4, 3);
8954
8955 evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
8956 evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
8957 // Needed later in reduction loop
8958 movdqu(xmm10, Address(table, 1 * 16));
8959 vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
8960 vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
8961
8962 // Swap 1,0,3,2 - 01 00 11 10
8963 evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
8964 evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
8965 vextracti128(xmm5, xmm8, 1);
8966 evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
8967
8968 // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
8969 // instead of a cmp instruction, we use the negative flag with the jl instruction
8970 addl(len, 128 - 16);
8971 jcc(Assembler::less, L_final_reduction_for_128);
8972
8973 bind(L_16B_reduction_loop);
8974 vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8975 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8976 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8977 movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
8978 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8979 addl(pos, 16);
8980 subl(len, 16);
8981 jcc(Assembler::greaterEqual, L_16B_reduction_loop);
8982
8983 bind(L_final_reduction_for_128);
8984 addl(len, 16);
8985 jcc(Assembler::equal, L_128_done);
8986
8987 bind(L_get_last_two_xmms);
8988 movdqu(xmm2, xmm7);
8989 addl(pos, len);
8990 movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
8991 subl(pos, len);
8992
8993 // get rid of the extra data that was loaded before
8994 // load the shift constant
8995 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
8996 movdqu(xmm0, Address(rax, len));
8997 addl(rax, len);
8998
8999 vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
9000 //Change mask to 512
9001 vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
9002 vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
9003
9004 blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
9005 vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
9006 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
9007 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
9008 vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
9009
9010 bind(L_128_done);
9011 // compute crc of a 128-bit value
9012 movdqu(xmm10, Address(table, 3 * 16));
9013 movdqu(xmm0, xmm7);
9014
9015 // 64b fold
9016 vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
9017 vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
9018 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
9019
9020 // 32b fold
9021 movdqu(xmm0, xmm7);
9022 vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
9023 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
9024 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
9025 jmp(L_barrett);
9026
9027 bind(L_less_than_256);
9028 kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
9029
9030 //barrett reduction
9031 bind(L_barrett);
9032 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
9033 movdqu(xmm1, xmm7);
9034 movdqu(xmm2, xmm7);
9035 movdqu(xmm10, Address(table, 4 * 16));
9036
9037 pclmulqdq(xmm7, xmm10, 0x0);
9038 pxor(xmm7, xmm2);
9039 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
9040 movdqu(xmm2, xmm7);
9041 pclmulqdq(xmm7, xmm10, 0x10);
9042 pxor(xmm7, xmm2);
9043 pxor(xmm7, xmm1);
9044 pextrd(crc, xmm7, 2);
9045
9046 bind(L_cleanup);
9047 addptr(rsp, 16 * 2 + 8);
9048 pop(r12);
9049 }
9050
// S. Gueron / Information Processing Letters 112 (2012) 184
// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
// Output: the 64-bit carry-less product of B * CONST
//
// Software fallback used when PCLMULQDQ is unavailable: each of the four
// bytes of `in` indexes a 256-entry table of 64-bit values (table block n,
// entries are 8 bytes each, hence the shll(,3) scaling), and the partial
// products are shifted to their byte position and xored together.
// `in` holds the result on return; tmp1-tmp3 are clobbered.
void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
                                     Register tmp1, Register tmp2, Register tmp3) {
  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
  if (n > 0) {
    addq(tmp3, n * 256 * 8);    // select precomputed table block n (256 x 8-byte entries)
  }
  // Q1 = TABLEExt[n][B & 0xFF];
  movl(tmp1, in);
  andl(tmp1, 0x000000FF);
  shll(tmp1, 3);                // byte index -> 8-byte entry offset
  addq(tmp1, tmp3);
  movq(tmp1, Address(tmp1, 0));

  // Q2 = TABLEExt[n][B >> 8 & 0xFF];
  movl(tmp2, in);
  shrl(tmp2, 8);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addq(tmp2, tmp3);
  movq(tmp2, Address(tmp2, 0));

  shlq(tmp2, 8);                // align partial product to its byte position
  xorq(tmp1, tmp2);

  // Q3 = TABLEExt[n][B >> 16 & 0xFF];
  movl(tmp2, in);
  shrl(tmp2, 16);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addq(tmp2, tmp3);
  movq(tmp2, Address(tmp2, 0));

  shlq(tmp2, 16);
  xorq(tmp1, tmp2);

  // Q4 = TABLEExt[n][B >> 24 & 0xFF];
  shrl(in, 24);
  andl(in, 0x000000FF);
  shll(in, 3);
  addq(in, tmp3);
  movq(in, Address(in, 0));

  shlq(in, 24);
  xorq(in, tmp1);
  // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}
9101
// Carry-less multiply in_out by a constant: either directly with PCLMULQDQ
// (the constant itself is const_or_pre_comp_const_index), or via the
// table-lookup fallback crc32c_ipl_alg4() (then the value is the table
// block index). Result is returned in in_out.
void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
                                      Register in_out,
                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
                                      XMMRegister w_xtmp2,
                                      Register tmp1,
                                      Register n_tmp2, Register n_tmp3) {
  if (is_pclmulqdq_supported) {
    movdl(w_xtmp1, in_out); // modified blindly

    movl(tmp1, const_or_pre_comp_const_index);
    movdl(w_xtmp2, tmp1);
    pclmulqdq(w_xtmp1, w_xtmp2, 0);   // low 64 x low 64 carry-less multiply

    movdq(in_out, w_xtmp1);
  } else {
    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
  }
}
9120
// Recombination Alternative 2: No bit-reflections
// T1 = (CRC_A * U1) << 1
// T2 = (CRC_B * U2) << 1
// C1 = T1 >> 32
// C2 = T2 >> 32
// T1 = T1 & 0xFFFFFFFF
// T2 = T2 & 0xFFFFFFFF
// T1 = CRC32(0, T1)
// T2 = CRC32(0, T2)
// C1 = C1 ^ T1
// C2 = C2 ^ T2
// CRC = C1 ^ C2 ^ CRC_C
//
// Combines three partial CRCs (in_out = CRC_A, in1 = CRC_B, in2 = CRC_C)
// into a single CRC left in in_out. The u1/u2 arguments select the
// multiplier constants (or table indices when PCLMULQDQ is unavailable).
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                     Register tmp1, Register tmp2,
                                     Register n_tmp3) {
  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
  shlq(in_out, 1);        // T1 = (CRC_A * U1) << 1
  movl(tmp1, in_out);     // tmp1 = T1 & 0xFFFFFFFF
  shrq(in_out, 32);       // in_out = C1 = T1 >> 32
  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);   // T1 = CRC32(0, T1)
  xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
  shlq(in1, 1);           // same recombination for the second stream
  movl(tmp1, in1);
  shrq(in1, 32);
  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);
  xorl(in1, tmp2);        // C2 = C2 ^ T2
  xorl(in_out, in1);      // C1 ^ C2
  xorl(in_out, in2);      // ... ^ CRC_C
}
9154
// Set N to predefined value
// Subtract from a length of a buffer
// execute in a loop:
// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
// for i = 1 to N do
//   CRC_A = CRC32(CRC_A, A[i])
//   CRC_B = CRC32(CRC_B, B[i])
//   CRC_C = CRC32(CRC_C, C[i])
// end for
// Recombine
//
// Consumes as many full 3*size-byte partitions as the remaining length
// (in_out1) allows. in_out2 is the running buffer pointer, in_out3 the
// running CRC. tmp1/tmp2 accumulate the B- and C-stream CRCs per partition.
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
                                       Register in_out1, Register in_out2, Register in_out3,
                                       Register tmp1, Register tmp2, Register tmp3,
                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                       Register tmp4, Register tmp5,
                                       Register n_tmp6) {
  Label L_processPartitions;
  Label L_processPartition;
  Label L_exit;

  bind(L_processPartitions);
  cmpl(in_out1, 3 * size);     // enough bytes left for a full partition?
  jcc(Assembler::less, L_exit);
  xorl(tmp1, tmp1);            // CRC_B = 0
  xorl(tmp2, tmp2);            // CRC_C = 0
  movq(tmp3, in_out2);
  addq(tmp3, size);            // end of the A stream

  bind(L_processPartition);
  // Pipelined: three independent CRC32 chains over the A, B and C streams.
  crc32(in_out3, Address(in_out2, 0), 8);
  crc32(tmp1, Address(in_out2, size), 8);
  crc32(tmp2, Address(in_out2, size * 2), 8);
  addq(in_out2, 8);
  cmpq(in_out2, tmp3);
  jcc(Assembler::less, L_processPartition);
  crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
                  w_xtmp1, w_xtmp2, w_xtmp3,
                  tmp4, tmp5,
                  n_tmp6);
  addq(in_out2, 2 * size);     // skip past the B and C streams just processed
  subl(in_out1, 3 * size);     // one full partition consumed
  jmp(L_processPartitions);

  bind(L_exit);
}
9200
// Algorithm 2: Pipelined usage of the CRC32 instruction.
// Input: A buffer I of L bytes.
// Output: the CRC32C value of the buffer.
// Notations:
// Write L = 24N + r, with N = floor (L/24).
// r = L mod 24 (0 <= r < 24).
// Consider I as the concatenation of A|B|C|R, where A, B, C, each,
// N quadwords, and R consists of r bytes.
// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
// B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
// C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
// if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
//
// Register roles: in_out = running CRC, in1 = buffer pointer, in2 = length.
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
                                          Register tmp1, Register tmp2, Register tmp3,
                                          Register tmp4, Register tmp5, Register tmp6,
                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                          bool is_pclmulqdq_supported) {
  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
  Label L_wordByWord;
  Label L_byteByByteProlog;
  Label L_byteByByte;
  Label L_exit;

  if (is_pclmulqdq_supported ) {
    // Load the precomputed multiplier constants from the stub table.
    // Note the u1/u2 members of each pair are swapped relative to table order.
    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::crc32c_table_addr();
    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 1);

    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 2);
    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 3);

    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 4);
    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 5);
    assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
  } else {
    // No PCLMULQDQ: the values are table indices for crc32c_ipl_alg4 instead.
    const_or_pre_comp_const_index[0] = 1;
    const_or_pre_comp_const_index[1] = 0;

    const_or_pre_comp_const_index[2] = 3;
    const_or_pre_comp_const_index[3] = 2;

    const_or_pre_comp_const_index[4] = 5;
    const_or_pre_comp_const_index[5] = 4;
  }
  // Process progressively smaller chunk sizes; each call consumes as many
  // full partitions of its size as the remaining length permits.
  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  // tmp1 = end address of the quadword-aligned portion:
  // in1 + (remaining length rounded down to a multiple of 8)
  movl(tmp1, in2);
  andl(tmp1, 0x00000007);
  negl(tmp1);
  addl(tmp1, in2);
  addq(tmp1, in1);

  cmpq(in1, tmp1);
  jccb(Assembler::greaterEqual, L_byteByByteProlog);
  align(16);
  BIND(L_wordByWord);
  // Consume remaining full quadwords one at a time.
  crc32(in_out, Address(in1, 0), 8);
  addq(in1, 8);
  cmpq(in1, tmp1);
  jcc(Assembler::less, L_wordByWord);

  BIND(L_byteByByteProlog);
  andl(in2, 0x00000007);        // r = L mod 8 tail bytes
  movl(tmp2, 1);

  cmpl(tmp2, in2);
  jccb(Assembler::greater, L_exit);
  BIND(L_byteByByte);
  crc32(in_out, Address(in1, 0), 1);
  incq(in1);
  incl(tmp2);
  cmpl(tmp2, in2);
  jcc(Assembler::lessEqual, L_byteByByte);

  BIND(L_exit);
}
9292 #undef BIND
9293 #undef BLOCK_COMMENT
9294
// Compress char[] array to byte[].
// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
// Return the array length if every element in array can be encoded,
// otherwise, the index of first non-latin1 (> 0xff) character.
// @IntrinsicCandidate
// public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
//   for (int i = 0; i < len; i++) {
//     char c = src[srcOff];
//     if (c > 0xff) {
//       return i;  // return index of non-latin1 char
//     }
//     dst[dstOff] = (byte)c;
//     srcOff++;
//     dstOff++;
//   }
//   return len;
// }
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
  XMMRegister tmp1Reg, XMMRegister tmp2Reg,
  XMMRegister tmp3Reg, XMMRegister tmp4Reg,
  Register tmp5, Register result, KRegister mask1, KRegister mask2) {
  Label copy_chars_loop, done, reset_sp, copy_tail;

  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result

  // rsi holds start addr of source char[] to be compressed
  // rdi holds start addr of destination byte[]
  // rdx holds length

  assert(len != result, "");

  // save length for return
  movl(result, len);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_loop_tail, below_threshold, reset_for_copy_tail;

    // alignment
    Label post_alignment;

    // if length of the string is less than 32, handle it the old fashioned way
    testl(len, -32);
    jcc(Assembler::zero, below_threshold);

    // First check whether a character is compressible ( <= 0xFF).
    // Create mask to test for Unicode chars inside zmm vector
    movl(tmp5, 0x00FF);
    evpbroadcastw(tmp2Reg, tmp5, Assembler::AVX_512bit);

    testl(len, -64);
    jccb(Assembler::zero, post_alignment);

    // tmp5 = number of chars needed to reach 32-byte alignment of dst
    movl(tmp5, dst);
    andl(tmp5, (32 - 1));
    negl(tmp5);
    andl(tmp5, (32 - 1));

    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jccb(Assembler::zero, post_alignment);

    // ~(~0 << len), where len is the # of remaining elements to process
    movl(len, 0xFFFFFFFF);
    shlxl(len, len, tmp5);
    notl(len);
    kmovdl(mask2, len);
    movl(len, result);      // restore len (was clobbered building the mask)

    evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
    evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
    ktestd(mask1, mask2);
    jcc(Assembler::carryClear, copy_tail);   // non-latin1 char found in prefix

    evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);

    // advance: src by 2*tmp5 bytes (chars), dst by tmp5 bytes
    addptr(src, tmp5);
    addptr(src, tmp5);
    addptr(dst, tmp5);
    subl(len, tmp5);

    bind(post_alignment);
    // end of alignment

    movl(tmp5, len);
    andl(tmp5, (32 - 1));    // tail count (in chars)
    andl(len, ~(32 - 1));    // vector count (in chars)
    jccb(Assembler::zero, copy_loop_tail);

    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);             // count up from -len to zero

    bind(copy_32_loop);
    evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
    evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    kortestdl(mask1, mask1);
    jccb(Assembler::carryClear, reset_for_copy_tail);

    // All elements in current processed chunk are valid candidates for
    // compression. Write a truncated byte elements to the memory.
    evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
    addptr(len, 32);
    jccb(Assembler::notZero, copy_32_loop);

    bind(copy_loop_tail);
    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jcc(Assembler::zero, done);

    movl(len, tmp5);

    // ~(~0 << len), where len is the # of remaining elements to process
    movl(tmp5, 0xFFFFFFFF);
    shlxl(tmp5, tmp5, len);
    notl(tmp5);

    kmovdl(mask2, tmp5);

    evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
    evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
    ktestd(mask1, mask2);
    jcc(Assembler::carryClear, copy_tail);   // non-latin1 char in masked tail

    evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
    jmp(done);

    bind(reset_for_copy_tail);
    // Vector loop saw a non-latin1 char: rewind pointers to the scalar tail.
    lea(src, Address(src, tmp5, Address::times_2));
    lea(dst, Address(dst, tmp5, Address::times_1));
    subptr(len, tmp5);
    jmp(copy_chars_loop);

    bind(below_threshold);
  }

  if (UseSSE42Intrinsics) {
    Label copy_32_loop, copy_16, copy_tail_sse, reset_for_copy_tail;

    // vectored compression
    testl(len, 0xfffffff8);
    jcc(Assembler::zero, copy_tail);

    movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
    movdl(tmp1Reg, tmp5);
    pshufd(tmp1Reg, tmp1Reg, 0);    // store Unicode mask in tmp1Reg

    andl(len, 0xfffffff0);
    jccb(Assembler::zero, copy_16);

    // compress 16 chars per iter
    pxor(tmp4Reg, tmp4Reg);

    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

    bind(copy_32_loop);
    movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
    por(tmp4Reg, tmp2Reg);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
    por(tmp4Reg, tmp3Reg);
    ptest(tmp4Reg, tmp1Reg);        // check for Unicode chars in next vector
    jccb(Assembler::notZero, reset_for_copy_tail);
    packuswb(tmp2Reg, tmp3Reg);     // only ASCII chars; compress each to 1 byte
    movdqu(Address(dst, len, Address::times_1), tmp2Reg);
    addptr(len, 16);
    jccb(Assembler::notZero, copy_32_loop);

    // compress next vector of 8 chars (if any)
    bind(copy_16);
    // len = 0
    testl(result, 0x00000008);      // check if there's a block of 8 chars to compress
    jccb(Assembler::zero, copy_tail_sse);

    pxor(tmp3Reg, tmp3Reg);

    movdqu(tmp2Reg, Address(src, 0));
    ptest(tmp2Reg, tmp1Reg);        // check for Unicode chars in vector
    jccb(Assembler::notZero, reset_for_copy_tail);
    packuswb(tmp2Reg, tmp3Reg);     // only LATIN1 chars; compress each to 1 byte
    movq(Address(dst, 0), tmp2Reg);
    addptr(src, 16);
    addptr(dst, 8);
    jmpb(copy_tail_sse);

    bind(reset_for_copy_tail);
    // SSE loop saw a non-latin1 char: rewind pointers to the scalar tail.
    movl(tmp5, result);
    andl(tmp5, 0x0000000f);
    lea(src, Address(src, tmp5, Address::times_2));
    lea(dst, Address(dst, tmp5, Address::times_1));
    subptr(len, tmp5);
    jmpb(copy_chars_loop);

    bind(copy_tail_sse);
    movl(len, result);
    andl(len, 0x00000007);    // tail count (in chars)
  }
  // compress 1 char per iter
  bind(copy_tail);
  testl(len, len);
  jccb(Assembler::zero, done);
  lea(src, Address(src, len, Address::times_2));
  lea(dst, Address(dst, len, Address::times_1));
  negptr(len);

  bind(copy_chars_loop);
  load_unsigned_short(tmp5, Address(src, len, Address::times_2));
  testl(tmp5, 0xff00);        // check if Unicode char
  jccb(Assembler::notZero, reset_sp);
  movb(Address(dst, len, Address::times_1), tmp5);  // ASCII char; compress to 1 byte
  increment(len);
  jccb(Assembler::notZero, copy_chars_loop);

  // add len then return (len will be zero if compress succeeded, otherwise negative)
  bind(reset_sp);
  addl(result, len);

  bind(done);
}
9521
// Inflate byte[] array to char[].
// ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
// @IntrinsicCandidate
// private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
//   for (int i = 0; i < len; i++) {
//     dst[dstOff++] = (char)(src[srcOff++] & 0xff);
//   }
// }
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
  XMMRegister tmp1, Register tmp2, KRegister mask) {
  Label copy_chars_loop, done, below_threshold, avx3_threshold;
  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp2

  // rsi holds start addr of source byte[] to be inflated
  // rdi holds start addr of destination char[]
  // rdx holds length
  assert_different_registers(src, dst, len, tmp2);
  movl(tmp2, len);      // keep the original length in tmp2
  if ((UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_tail;
    Register tmp3_aliased = len;

    // if length of the string is less than 16, handle it in an old fashioned way
    testl(len, -16);
    jcc(Assembler::zero, below_threshold);

    testl(len, -1 * AVX3Threshold);
    jcc(Assembler::zero, avx3_threshold);

    // In order to use only one arithmetic operation for the main loop we use
    // this pre-calculation
    andl(tmp2, (32 - 1));  // tail count (in chars), 32 element wide loop
    andl(len, -32);        // vector count
    jccb(Assembler::zero, copy_tail);

    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);


    // inflate 32 chars per iter
    bind(copy_32_loop);
    vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
    evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
    addptr(len, 32);
    jcc(Assembler::notZero, copy_32_loop);

    bind(copy_tail);
    // bail out when there is nothing to be done
    testl(tmp2, -1); // we don't destroy the contents of tmp2 here
    jcc(Assembler::zero, done);

    // ~(~0 << length), where length is the # of remaining elements to process
    movl(tmp3_aliased, -1);
    shlxl(tmp3_aliased, tmp3_aliased, tmp2);
    notl(tmp3_aliased);
    kmovdl(mask, tmp3_aliased);
    // Masked load/inflate/store of the final partial vector.
    evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
    evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);

    jmp(done);
    bind(avx3_threshold);
  }
  if (UseSSE42Intrinsics) {
    Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;

    if (UseAVX > 1) {
      andl(tmp2, (16 - 1));   // tail count (in chars)
      andl(len, -16);         // vector count (in chars)
      jccb(Assembler::zero, copy_new_tail);
    } else {
      andl(tmp2, 0x00000007); // tail count (in chars)
      andl(len, 0xfffffff8);  // vector count (in chars)
      jccb(Assembler::zero, copy_tail);
    }

    // vectored inflation
    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);

    if (UseAVX > 1) {
      bind(copy_16_loop);
      vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
      vmovdqu(Address(dst, len, Address::times_2), tmp1);
      addptr(len, 16);
      jcc(Assembler::notZero, copy_16_loop);

      bind(below_threshold);
      bind(copy_new_tail);
      movl(len, tmp2);
      andl(tmp2, 0x00000007);  // final scalar tail count
      andl(len, 0xFFFFFFF8);   // one more 8-char vector, if present
      jccb(Assembler::zero, copy_tail);

      pmovzxbw(tmp1, Address(src, 0));
      movdqu(Address(dst, 0), tmp1);
      addptr(src, 8);
      addptr(dst, 2 * 8);

      jmp(copy_tail, true);
    }

    // inflate 8 chars per iter
    bind(copy_8_loop);
    pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
    movdqu(Address(dst, len, Address::times_2), tmp1);
    addptr(len, 8);
    jcc(Assembler::notZero, copy_8_loop);

    bind(copy_tail);
    movl(len, tmp2);

    cmpl(len, 4);
    jccb(Assembler::less, copy_bytes);

    movdl(tmp1, Address(src, 0));  // load 4 byte chars
    pmovzxbw(tmp1, tmp1);
    movq(Address(dst, 0), tmp1);
    subptr(len, 4);
    addptr(src, 4);
    addptr(dst, 8);

    bind(copy_bytes);
  } else {
    bind(below_threshold);
  }

  testl(len, len);
  jccb(Assembler::zero, done);
  lea(src, Address(src, len, Address::times_1));
  lea(dst, Address(dst, len, Address::times_2));
  negptr(len);

  // inflate 1 char per iter
  bind(copy_chars_loop);
  load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
  movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
  increment(len);
  jcc(Assembler::notZero, copy_chars_loop);

  bind(done);
}
9671
9672 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
9673 switch(type) {
9674 case T_BYTE:
9675 case T_BOOLEAN:
9676 evmovdqub(dst, kmask, src, merge, vector_len);
9677 break;
9678 case T_CHAR:
9679 case T_SHORT:
9680 evmovdquw(dst, kmask, src, merge, vector_len);
9681 break;
9682 case T_INT:
9683 case T_FLOAT:
9684 evmovdqul(dst, kmask, src, merge, vector_len);
9685 break;
9686 case T_LONG:
9687 case T_DOUBLE:
9688 evmovdquq(dst, kmask, src, merge, vector_len);
9689 break;
9690 default:
9691 fatal("Unexpected type argument %s", type2name(type));
9692 break;
9693 }
9694 }
9695
9696
9697 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
9698 switch(type) {
9699 case T_BYTE:
9700 case T_BOOLEAN:
9701 evmovdqub(dst, kmask, src, merge, vector_len);
9702 break;
9703 case T_CHAR:
9704 case T_SHORT:
9705 evmovdquw(dst, kmask, src, merge, vector_len);
9706 break;
9707 case T_INT:
9708 case T_FLOAT:
9709 evmovdqul(dst, kmask, src, merge, vector_len);
9710 break;
9711 case T_LONG:
9712 case T_DOUBLE:
9713 evmovdquq(dst, kmask, src, merge, vector_len);
9714 break;
9715 default:
9716 fatal("Unexpected type argument %s", type2name(type));
9717 break;
9718 }
9719 }
9720
9721 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
9722 switch(type) {
9723 case T_BYTE:
9724 case T_BOOLEAN:
9725 evmovdqub(dst, kmask, src, merge, vector_len);
9726 break;
9727 case T_CHAR:
9728 case T_SHORT:
9729 evmovdquw(dst, kmask, src, merge, vector_len);
9730 break;
9731 case T_INT:
9732 case T_FLOAT:
9733 evmovdqul(dst, kmask, src, merge, vector_len);
9734 break;
9735 case T_LONG:
9736 case T_DOUBLE:
9737 evmovdquq(dst, kmask, src, merge, vector_len);
9738 break;
9739 default:
9740 fatal("Unexpected type argument %s", type2name(type));
9741 break;
9742 }
9743 }
9744
9745 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
9746 switch(masklen) {
9747 case 2:
9748 knotbl(dst, src);
9749 movl(rtmp, 3);
9750 kmovbl(ktmp, rtmp);
9751 kandbl(dst, ktmp, dst);
9752 break;
9753 case 4:
9754 knotbl(dst, src);
9755 movl(rtmp, 15);
9756 kmovbl(ktmp, rtmp);
9757 kandbl(dst, ktmp, dst);
9758 break;
9759 case 8:
9760 knotbl(dst, src);
9761 break;
9762 case 16:
9763 knotwl(dst, src);
9764 break;
9765 case 32:
9766 knotdl(dst, src);
9767 break;
9768 case 64:
9769 knotql(dst, src);
9770 break;
9771 default:
9772 fatal("Unexpected vector length %d", masklen);
9773 break;
9774 }
9775 }
9776
9777 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9778 switch(type) {
9779 case T_BOOLEAN:
9780 case T_BYTE:
9781 kandbl(dst, src1, src2);
9782 break;
9783 case T_CHAR:
9784 case T_SHORT:
9785 kandwl(dst, src1, src2);
9786 break;
9787 case T_INT:
9788 case T_FLOAT:
9789 kanddl(dst, src1, src2);
9790 break;
9791 case T_LONG:
9792 case T_DOUBLE:
9793 kandql(dst, src1, src2);
9794 break;
9795 default:
9796 fatal("Unexpected type argument %s", type2name(type));
9797 break;
9798 }
9799 }
9800
9801 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9802 switch(type) {
9803 case T_BOOLEAN:
9804 case T_BYTE:
9805 korbl(dst, src1, src2);
9806 break;
9807 case T_CHAR:
9808 case T_SHORT:
9809 korwl(dst, src1, src2);
9810 break;
9811 case T_INT:
9812 case T_FLOAT:
9813 kordl(dst, src1, src2);
9814 break;
9815 case T_LONG:
9816 case T_DOUBLE:
9817 korql(dst, src1, src2);
9818 break;
9819 default:
9820 fatal("Unexpected type argument %s", type2name(type));
9821 break;
9822 }
9823 }
9824
9825 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9826 switch(type) {
9827 case T_BOOLEAN:
9828 case T_BYTE:
9829 kxorbl(dst, src1, src2);
9830 break;
9831 case T_CHAR:
9832 case T_SHORT:
9833 kxorwl(dst, src1, src2);
9834 break;
9835 case T_INT:
9836 case T_FLOAT:
9837 kxordl(dst, src1, src2);
9838 break;
9839 case T_LONG:
9840 case T_DOUBLE:
9841 kxorql(dst, src1, src2);
9842 break;
9843 default:
9844 fatal("Unexpected type argument %s", type2name(type));
9845 break;
9846 }
9847 }
9848
9849 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9850 switch(type) {
9851 case T_BOOLEAN:
9852 case T_BYTE:
9853 evpermb(dst, mask, nds, src, merge, vector_len); break;
9854 case T_CHAR:
9855 case T_SHORT:
9856 evpermw(dst, mask, nds, src, merge, vector_len); break;
9857 case T_INT:
9858 case T_FLOAT:
9859 evpermd(dst, mask, nds, src, merge, vector_len); break;
9860 case T_LONG:
9861 case T_DOUBLE:
9862 evpermq(dst, mask, nds, src, merge, vector_len); break;
9863 default:
9864 fatal("Unexpected type argument %s", type2name(type)); break;
9865 }
9866 }
9867
9868 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9869 switch(type) {
9870 case T_BOOLEAN:
9871 case T_BYTE:
9872 evpermb(dst, mask, nds, src, merge, vector_len); break;
9873 case T_CHAR:
9874 case T_SHORT:
9875 evpermw(dst, mask, nds, src, merge, vector_len); break;
9876 case T_INT:
9877 case T_FLOAT:
9878 evpermd(dst, mask, nds, src, merge, vector_len); break;
9879 case T_LONG:
9880 case T_DOUBLE:
9881 evpermq(dst, mask, nds, src, merge, vector_len); break;
9882 default:
9883 fatal("Unexpected type argument %s", type2name(type)); break;
9884 }
9885 }
9886
9887 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9888 switch(type) {
9889 case T_BYTE:
9890 evpminub(dst, mask, nds, src, merge, vector_len); break;
9891 case T_SHORT:
9892 evpminuw(dst, mask, nds, src, merge, vector_len); break;
9893 case T_INT:
9894 evpminud(dst, mask, nds, src, merge, vector_len); break;
9895 case T_LONG:
9896 evpminuq(dst, mask, nds, src, merge, vector_len); break;
9897 default:
9898 fatal("Unexpected type argument %s", type2name(type)); break;
9899 }
9900 }
9901
9902 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9903 switch(type) {
9904 case T_BYTE:
9905 evpmaxub(dst, mask, nds, src, merge, vector_len); break;
9906 case T_SHORT:
9907 evpmaxuw(dst, mask, nds, src, merge, vector_len); break;
9908 case T_INT:
9909 evpmaxud(dst, mask, nds, src, merge, vector_len); break;
9910 case T_LONG:
9911 evpmaxuq(dst, mask, nds, src, merge, vector_len); break;
9912 default:
9913 fatal("Unexpected type argument %s", type2name(type)); break;
9914 }
9915 }
9916
9917 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9918 switch(type) {
9919 case T_BYTE:
9920 evpminub(dst, mask, nds, src, merge, vector_len); break;
9921 case T_SHORT:
9922 evpminuw(dst, mask, nds, src, merge, vector_len); break;
9923 case T_INT:
9924 evpminud(dst, mask, nds, src, merge, vector_len); break;
9925 case T_LONG:
9926 evpminuq(dst, mask, nds, src, merge, vector_len); break;
9927 default:
9928 fatal("Unexpected type argument %s", type2name(type)); break;
9929 }
9930 }
9931
9932 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9933 switch(type) {
9934 case T_BYTE:
9935 evpmaxub(dst, mask, nds, src, merge, vector_len); break;
9936 case T_SHORT:
9937 evpmaxuw(dst, mask, nds, src, merge, vector_len); break;
9938 case T_INT:
9939 evpmaxud(dst, mask, nds, src, merge, vector_len); break;
9940 case T_LONG:
9941 evpmaxuq(dst, mask, nds, src, merge, vector_len); break;
9942 default:
9943 fatal("Unexpected type argument %s", type2name(type)); break;
9944 }
9945 }
9946
9947 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9948 switch(type) {
9949 case T_BYTE:
9950 evpminsb(dst, mask, nds, src, merge, vector_len); break;
9951 case T_SHORT:
9952 evpminsw(dst, mask, nds, src, merge, vector_len); break;
9953 case T_INT:
9954 evpminsd(dst, mask, nds, src, merge, vector_len); break;
9955 case T_LONG:
9956 evpminsq(dst, mask, nds, src, merge, vector_len); break;
9957 case T_FLOAT:
9958 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9959 case T_DOUBLE:
9960 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9961 default:
9962 fatal("Unexpected type argument %s", type2name(type)); break;
9963 }
9964 }
9965
9966 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9967 switch(type) {
9968 case T_BYTE:
9969 evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9970 case T_SHORT:
9971 evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9972 case T_INT:
9973 evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9974 case T_LONG:
9975 evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9976 case T_FLOAT:
9977 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9978 case T_DOUBLE:
9979 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9980 default:
9981 fatal("Unexpected type argument %s", type2name(type)); break;
9982 }
9983 }
9984
9985 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9986 switch(type) {
9987 case T_BYTE:
9988 evpminsb(dst, mask, nds, src, merge, vector_len); break;
9989 case T_SHORT:
9990 evpminsw(dst, mask, nds, src, merge, vector_len); break;
9991 case T_INT:
9992 evpminsd(dst, mask, nds, src, merge, vector_len); break;
9993 case T_LONG:
9994 evpminsq(dst, mask, nds, src, merge, vector_len); break;
9995 case T_FLOAT:
9996 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9997 case T_DOUBLE:
9998 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9999 default:
10000 fatal("Unexpected type argument %s", type2name(type)); break;
10001 }
10002 }
10003
10004 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
10005 switch(type) {
10006 case T_BYTE:
10007 evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
10008 case T_SHORT:
10009 evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
10010 case T_INT:
10011 evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
10012 case T_LONG:
10013 evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
10014 case T_FLOAT:
10015 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
10016 case T_DOUBLE:
10017 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
10018 default:
10019 fatal("Unexpected type argument %s", type2name(type)); break;
10020 }
10021 }
10022
10023 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
10024 switch(type) {
10025 case T_INT:
10026 evpxord(dst, mask, nds, src, merge, vector_len); break;
10027 case T_LONG:
10028 evpxorq(dst, mask, nds, src, merge, vector_len); break;
10029 default:
10030 fatal("Unexpected type argument %s", type2name(type)); break;
10031 }
10032 }
10033
10034 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
10035 switch(type) {
10036 case T_INT:
10037 evpxord(dst, mask, nds, src, merge, vector_len); break;
10038 case T_LONG:
10039 evpxorq(dst, mask, nds, src, merge, vector_len); break;
10040 default:
10041 fatal("Unexpected type argument %s", type2name(type)); break;
10042 }
10043 }
10044
10045 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
10046 switch(type) {
10047 case T_INT:
10048 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
10049 case T_LONG:
10050 evporq(dst, mask, nds, src, merge, vector_len); break;
10051 default:
10052 fatal("Unexpected type argument %s", type2name(type)); break;
10053 }
10054 }
10055
10056 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
10057 switch(type) {
10058 case T_INT:
10059 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
10060 case T_LONG:
10061 evporq(dst, mask, nds, src, merge, vector_len); break;
10062 default:
10063 fatal("Unexpected type argument %s", type2name(type)); break;
10064 }
10065 }
10066
10067 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
10068 switch(type) {
10069 case T_INT:
10070 evpandd(dst, mask, nds, src, merge, vector_len); break;
10071 case T_LONG:
10072 evpandq(dst, mask, nds, src, merge, vector_len); break;
10073 default:
10074 fatal("Unexpected type argument %s", type2name(type)); break;
10075 }
10076 }
10077
10078 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
10079 switch(type) {
10080 case T_INT:
10081 evpandd(dst, mask, nds, src, merge, vector_len); break;
10082 case T_LONG:
10083 evpandq(dst, mask, nds, src, merge, vector_len); break;
10084 default:
10085 fatal("Unexpected type argument %s", type2name(type)); break;
10086 }
10087 }
10088
10089 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
10090 switch(masklen) {
10091 case 8:
10092 kortestbl(src1, src2);
10093 break;
10094 case 16:
10095 kortestwl(src1, src2);
10096 break;
10097 case 32:
10098 kortestdl(src1, src2);
10099 break;
10100 case 64:
10101 kortestql(src1, src2);
10102 break;
10103 default:
10104 fatal("Unexpected mask length %d", masklen);
10105 break;
10106 }
10107 }
10108
10109
10110 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
10111 switch(masklen) {
10112 case 8:
10113 ktestbl(src1, src2);
10114 break;
10115 case 16:
10116 ktestwl(src1, src2);
10117 break;
10118 case 32:
10119 ktestdl(src1, src2);
10120 break;
10121 case 64:
10122 ktestql(src1, src2);
10123 break;
10124 default:
10125 fatal("Unexpected mask length %d", masklen);
10126 break;
10127 }
10128 }
10129
10130 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
10131 switch(type) {
10132 case T_INT:
10133 evprold(dst, mask, src, shift, merge, vlen_enc); break;
10134 case T_LONG:
10135 evprolq(dst, mask, src, shift, merge, vlen_enc); break;
10136 default:
10137 fatal("Unexpected type argument %s", type2name(type)); break;
10138 break;
10139 }
10140 }
10141
10142 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
10143 switch(type) {
10144 case T_INT:
10145 evprord(dst, mask, src, shift, merge, vlen_enc); break;
10146 case T_LONG:
10147 evprorq(dst, mask, src, shift, merge, vlen_enc); break;
10148 default:
10149 fatal("Unexpected type argument %s", type2name(type)); break;
10150 }
10151 }
10152
10153 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
10154 switch(type) {
10155 case T_INT:
10156 evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
10157 case T_LONG:
10158 evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
10159 default:
10160 fatal("Unexpected type argument %s", type2name(type)); break;
10161 }
10162 }
10163
10164 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
10165 switch(type) {
10166 case T_INT:
10167 evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
10168 case T_LONG:
10169 evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
10170 default:
10171 fatal("Unexpected type argument %s", type2name(type)); break;
10172 }
10173 }
10174
10175 void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
10176 assert(rscratch != noreg || always_reachable(src), "missing");
10177
10178 if (reachable(src)) {
10179 evpandq(dst, nds, as_Address(src), vector_len);
10180 } else {
10181 lea(rscratch, src);
10182 evpandq(dst, nds, Address(rscratch, 0), vector_len);
10183 }
10184 }
10185
10186 void MacroAssembler::evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
10187 assert(rscratch != noreg || always_reachable(src), "missing");
10188
10189 if (reachable(src)) {
10190 Assembler::evpaddq(dst, mask, nds, as_Address(src), merge, vector_len);
10191 } else {
10192 lea(rscratch, src);
10193 Assembler::evpaddq(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
10194 }
10195 }
10196
10197 void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
10198 assert(rscratch != noreg || always_reachable(src), "missing");
10199
10200 if (reachable(src)) {
10201 evporq(dst, nds, as_Address(src), vector_len);
10202 } else {
10203 lea(rscratch, src);
10204 evporq(dst, nds, Address(rscratch, 0), vector_len);
10205 }
10206 }
10207
10208 void MacroAssembler::vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
10209 assert(rscratch != noreg || always_reachable(src), "missing");
10210
10211 if (reachable(src)) {
10212 vpshufb(dst, nds, as_Address(src), vector_len);
10213 } else {
10214 lea(rscratch, src);
10215 vpshufb(dst, nds, Address(rscratch, 0), vector_len);
10216 }
10217 }
10218
10219 void MacroAssembler::vpor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
10220 assert(rscratch != noreg || always_reachable(src), "missing");
10221
10222 if (reachable(src)) {
10223 Assembler::vpor(dst, nds, as_Address(src), vector_len);
10224 } else {
10225 lea(rscratch, src);
10226 Assembler::vpor(dst, nds, Address(rscratch, 0), vector_len);
10227 }
10228 }
10229
10230 void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) {
10231 assert(rscratch != noreg || always_reachable(src3), "missing");
10232
10233 if (reachable(src3)) {
10234 vpternlogq(dst, imm8, src2, as_Address(src3), vector_len);
10235 } else {
10236 lea(rscratch, src3);
10237 vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len);
10238 }
10239 }
10240
10241 #if COMPILER2_OR_JVMCI
10242
// Predicated (partial) vector store: writes only the first `length` elements
// of xmm, with element type bt, to dst. The mask is built with BZHI on an
// all-ones value, yielding `length` consecutive low set bits.
// Clobbers temp and mask; length is an element count, not bytes.
void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
                                 Register length, Register temp, int vec_enc) {
  // Computing mask for predicated vector store.
  movptr(temp, -1);
  bzhiq(temp, temp, length);
  kmov(mask, temp);
  evmovdqu(bt, mask, dst, xmm, true, vec_enc);
}
10251
10252 // Set memory operation for length "less than" 64 bytes.
// Set memory operation for length "less than" 64 bytes.
// shift encodes the element size (0=byte .. 3=long). With 32-byte vectors
// the first 32 bytes are stored unmasked and the remainder goes through a
// masked 32-byte store; with 64-byte vectors a single masked ZMM store is
// used. length is in elements and is decremented in the 32-byte path.
void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
                                   XMMRegister xmm, KRegister mask, Register length,
                                   Register temp, bool use64byteVector) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  if (!use64byteVector) {
    fill32(dst, disp, xmm);
    subptr(length, 32 >> shift);  // account for the unmasked 32-byte store
    fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
  } else {
    assert(MaxVectorSize == 64, "vector length != 64");
    fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
  }
}
10267
10268
// Masked set for a residual of at most 32 bytes. shift selects the element
// type (0=byte .. 3=long); length is in elements.
void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
                                   XMMRegister xmm, KRegister mask, Register length,
                                   Register temp) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
}
10276
10277
// Store 32 bytes from xmm to dst with an unmasked 256-bit move.
void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  vmovdqu(dst, xmm);
}
10282
// Convenience overload: store 32 bytes from xmm at [dst + disp].
void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
  fill32(Address(dst, disp), xmm);
}
10286
// Store 64 bytes from xmm to dst: either two 32-byte stores or, when
// use64byteVector is set, one 512-bit store.
void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  if (!use64byteVector) {
    fill32(dst, xmm);
    fill32(dst.plus_disp(32), xmm);
  } else {
    evmovdquq(dst, xmm, Assembler::AVX_512bit);
  }
}
10296
// Convenience overload: store 64 bytes from xmm at [dst + disp].
void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
  fill64(Address(dst, disp), xmm, use64byteVector);
}
10300
// Emits an AVX3 (AVX-512) fill loop: stores `count` elements of `value`
// (element size derived from `type`) starting at `to`. Uses masked stores
// for residual tails, so no scalar cleanup loop is needed. Two sequences are
// generated: a 32-byte (YMM) sequence for small counts and, when
// MaxVectorSize == 64, a 64-byte (ZMM) sequence for counts above
// CopyAVX3Threshold. Clobbers rtmp, xtmp, k2 and r8.
void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
                                        Register count, Register rtmp, XMMRegister xtmp) {
  Label L_exit;
  Label L_fill_start;
  Label L_fill_64_bytes;
  Label L_fill_96_bytes;
  Label L_fill_128_bytes;
  Label L_fill_128_bytes_loop;
  Label L_fill_128_loop_header;
  Label L_fill_128_bytes_loop_header;
  Label L_fill_128_bytes_loop_pre_header;
  Label L_fill_zmm_sequence;

  // shift = log2(element size); used to convert byte thresholds into
  // element counts (bytes >> shift).
  int shift = -1;
  switch(type) {
    case T_BYTE: shift = 0;
      break;
    case T_SHORT: shift = 1;
      break;
    case T_INT: shift = 2;
      break;
    /* Uncomment when LONG fill stubs are supported.
    case T_LONG: shift = 3;
      break;
    */
    default:
      fatal("Unhandled type: %s\n", type2name(type));
  }

  if ((CopyAVX3Threshold != 0) || (MaxVectorSize == 32)) {

    // Large fills go to the ZMM sequence below (only exists for 64-byte
    // vectors).
    if (MaxVectorSize == 64) {
      cmpq(count, CopyAVX3Threshold >> shift);
      jcc(Assembler::greater, L_fill_zmm_sequence);
    }

    // Splat the fill value across a 256-bit vector.
    evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);

    bind(L_fill_start);

    // <= 32 bytes: single masked 32-byte store.
    cmpq(count, 32 >> shift);
    jccb(Assembler::greater, L_fill_64_bytes);
    fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
    jmp(L_exit);

    // <= 64 bytes: 32-byte store + masked tail.
    bind(L_fill_64_bytes);
    cmpq(count, 64 >> shift);
    jccb(Assembler::greater, L_fill_96_bytes);
    fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
    jmp(L_exit);

    // <= 96 bytes: full 64 bytes + masked 32-byte tail.
    bind(L_fill_96_bytes);
    cmpq(count, 96 >> shift);
    jccb(Assembler::greater, L_fill_128_bytes);
    fill64(to, 0, xtmp);
    subq(count, 64 >> shift);
    fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
    jmp(L_exit);

    // <= 128 bytes: 96 bytes unmasked + masked 32-byte tail.
    bind(L_fill_128_bytes);
    cmpq(count, 128 >> shift);
    jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
    fill64(to, 0, xtmp);
    fill32(to, 64, xtmp);
    subq(count, 96 >> shift);
    fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
    jmp(L_exit);

    // > 128 bytes: align `to` to a 32-byte boundary with one masked store,
    // then run the unrolled 128-byte loop.
    bind(L_fill_128_bytes_loop_pre_header);
    {
      mov(rtmp, to);
      andq(rtmp, 31);
      jccb(Assembler::zero, L_fill_128_bytes_loop_header);
      // rtmp = number of bytes needed to reach the next 32-byte boundary.
      negq(rtmp);
      addq(rtmp, 32);
      mov64(r8, -1L);
      bzhiq(r8, r8, rtmp);
      kmovql(k2, r8);
      evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit);
      addq(to, rtmp);
      shrq(rtmp, shift);  // bytes -> elements
      subq(count, rtmp);
    }

    // If fewer than 128 bytes remain after alignment, reuse the small-case
    // dispatch above.
    cmpq(count, 128 >> shift);
    jcc(Assembler::less, L_fill_start);

    bind(L_fill_128_bytes_loop_header);
    subq(count, 128 >> shift);

    align32();
    bind(L_fill_128_bytes_loop);
    fill64(to, 0, xtmp);
    fill64(to, 64, xtmp);
    addq(to, 128);
    subq(count, 128 >> shift);
    jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);

    // Undo the loop-header bias and handle any tail via the small cases.
    addq(count, 128 >> shift);
    jcc(Assembler::zero, L_exit);
    jmp(L_fill_start);
  }

  if (MaxVectorSize == 64) {
    // Sequence using 64 byte ZMM register.
    Label L_fill_128_bytes_zmm;
    Label L_fill_192_bytes_zmm;
    Label L_fill_192_bytes_loop_zmm;
    Label L_fill_192_bytes_loop_header_zmm;
    Label L_fill_192_bytes_loop_pre_header_zmm;
    Label L_fill_start_zmm_sequence;

    bind(L_fill_zmm_sequence);
    // Splat the fill value across a 512-bit vector.
    evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);

    bind(L_fill_start_zmm_sequence);
    // <= 64 bytes: single masked 64-byte store.
    cmpq(count, 64 >> shift);
    jccb(Assembler::greater, L_fill_128_bytes_zmm);
    fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
    jmp(L_exit);

    // <= 128 bytes: full 64 bytes + masked 64-byte tail.
    bind(L_fill_128_bytes_zmm);
    cmpq(count, 128 >> shift);
    jccb(Assembler::greater, L_fill_192_bytes_zmm);
    fill64(to, 0, xtmp, true);
    subq(count, 64 >> shift);
    fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
    jmp(L_exit);

    // <= 192 bytes: 128 bytes unmasked + masked 64-byte tail.
    bind(L_fill_192_bytes_zmm);
    cmpq(count, 192 >> shift);
    jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
    fill64(to, 0, xtmp, true);
    fill64(to, 64, xtmp, true);
    subq(count, 128 >> shift);
    fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
    jmp(L_exit);

    // > 192 bytes: align `to` to a 64-byte boundary with one masked store,
    // then run the unrolled 192-byte loop.
    bind(L_fill_192_bytes_loop_pre_header_zmm);
    {
      movq(rtmp, to);
      andq(rtmp, 63);
      jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
      // rtmp = number of bytes needed to reach the next 64-byte boundary.
      negq(rtmp);
      addq(rtmp, 64);
      mov64(r8, -1L);
      bzhiq(r8, r8, rtmp);
      kmovql(k2, r8);
      evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit);
      addq(to, rtmp);
      shrq(rtmp, shift);  // bytes -> elements
      subq(count, rtmp);
    }

    cmpq(count, 192 >> shift);
    jcc(Assembler::less, L_fill_start_zmm_sequence);

    bind(L_fill_192_bytes_loop_header_zmm);
    subq(count, 192 >> shift);

    align32();
    bind(L_fill_192_bytes_loop_zmm);
    fill64(to, 0, xtmp, true);
    fill64(to, 64, xtmp, true);
    fill64(to, 128, xtmp, true);
    addq(to, 192);
    subq(count, 192 >> shift);
    jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);

    // Undo the loop-header bias and handle any tail via the small cases.
    addq(count, 192 >> shift);
    jcc(Assembler::zero, L_exit);
    jmp(L_fill_start_zmm_sequence);
  }
  bind(L_exit);
}
10476 #endif //COMPILER2_OR_JVMCI
10477
10478
// float -> int conversion with JLS semantics. The hardware conversion yields
// 0x80000000 for overflow/underflow/NaN; in that case the source float is
// passed on the stack to the f2i fixup stub, whose result is popped into dst.
void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
  Label done;
  cvttss2sil(dst, src);
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
  cmpl(dst, 0x80000000); // float_sign_flip
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movflt(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
  // pop both restores rsp and loads the stub's result
  pop(dst);
  bind(done);
}
10491
// double -> int conversion with JLS semantics; same scheme as convert_f2i
// but with a double argument and the d2i fixup stub.
void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
  Label done;
  cvttsd2sil(dst, src);
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
  cmpl(dst, 0x80000000); // float_sign_flip
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
  pop(dst);
  bind(done);
}
10504
// float -> long conversion with JLS semantics. The truncating conversion
// produces a 64-bit result, so the overflow indicator compared against is the
// 64-bit sign-flip constant (double_sign_flip), even for a float source.
void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
  Label done;
  cvttss2siq(dst, src);
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movflt(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
  pop(dst);
  bind(done);
}
10516
void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  // Following code is line by line assembly translation rounding algorithm.
  // Please refer to java.lang.Math.round(float) algorithm for details.
  const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
  const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
  const int32_t FloatConsts_EXP_BIAS = 127;
  const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
  const int32_t MINUS_32 = 0xFFFFFFE0;
  Label L_special_case, L_block1, L_exit;
  // Extract the biased exponent of src into dst.
  movl(rtmp, FloatConsts_EXP_BIT_MASK);
  movdl(dst, src);
  andl(dst, rtmp);
  sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
  // rtmp/rcx = shift = (SIGNIFICAND_WIDTH - 2 + EXP_BIAS) - biasedExp
  movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
  subl(rtmp, dst);
  movl(rcx, rtmp);
  // If shift has bits outside [0, 31], fall back to plain f2i conversion
  // (matches the (shift & -32) == 0 test in Math.round).
  movl(dst, MINUS_32);
  testl(rtmp, dst);
  jccb(Assembler::notEqual, L_special_case);
  // Reconstruct the significand with the implicit leading one.
  movdl(dst, src);
  andl(dst, FloatConsts_SIGNIF_BIT_MASK);
  orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
  // Negate the significand for negative inputs.
  movdl(rtmp, src);
  testl(rtmp, rtmp);
  jccb(Assembler::greaterEqual, L_block1);
  negl(dst);
  bind(L_block1);
  // (value >> (shift - 1)) + 1 >> 1  — round half up (shift count in cl).
  sarl(dst);
  addl(dst, 0x1);
  sarl(dst, 0x1);
  jmp(L_exit);
  bind(L_special_case);
  convert_f2i(dst, src);
  bind(L_exit);
}
10552
void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  // Following code is line by line assembly translation rounding algorithm.
  // Please refer to java.lang.Math.round(double) algorithm for details.
  const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
  const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
  const int64_t DoubleConsts_EXP_BIAS = 1023;
  const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
  const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
  Label L_special_case, L_block1, L_exit;
  // Extract the biased exponent of src into dst.
  mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
  movq(dst, src);
  andq(dst, rtmp);
  sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
  // rtmp/rcx = shift = (SIGNIFICAND_WIDTH - 2 + EXP_BIAS) - biasedExp
  mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
  subq(rtmp, dst);
  movq(rcx, rtmp);
  // If shift has bits outside [0, 63], fall back to plain d2l conversion
  // (matches the (shift & -64) == 0 test in Math.round).
  mov64(dst, MINUS_64);
  testq(rtmp, dst);
  jccb(Assembler::notEqual, L_special_case);
  // Reconstruct the significand with the implicit leading one.
  movq(dst, src);
  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
  andq(dst, rtmp);
  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
  orq(dst, rtmp);
  // Negate the significand for negative inputs.
  movq(rtmp, src);
  testq(rtmp, rtmp);
  jccb(Assembler::greaterEqual, L_block1);
  negq(dst);
  bind(L_block1);
  // (value >> (shift - 1)) + 1 >> 1  — round half up (shift count in cl).
  sarq(dst);
  addq(dst, 0x1);
  sarq(dst, 0x1);
  jmp(L_exit);
  bind(L_special_case);
  convert_d2l(dst, src);
  bind(L_exit);
}
10590
// double -> long conversion with JLS semantics. Hardware conversion yields
// the 64-bit sign-flip value for overflow/underflow/NaN; that case is fixed
// up by the d2l stub, which receives src on the stack and returns via pop.
void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
  Label done;
  cvttsd2siq(dst, src);
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
  pop(dst);
  bind(done);
}
10602
10603 void MacroAssembler::cache_wb(Address line)
10604 {
10605 // 64 bit cpus always support clflush
10606 assert(VM_Version::supports_clflush(), "clflush should be available");
10607 bool optimized = VM_Version::supports_clflushopt();
10608 bool no_evict = VM_Version::supports_clwb();
10609
10610 // prefer clwb (writeback without evict) otherwise
10611 // prefer clflushopt (potentially parallel writeback with evict)
10612 // otherwise fallback on clflush (serial writeback with evict)
10613
10614 if (optimized) {
10615 if (no_evict) {
10616 clwb(line);
10617 } else {
10618 clflushopt(line);
10619 }
10620 } else {
10621 // no need for fence when using CLFLUSH
10622 clflush(line);
10623 }
10624 }
10625
// Ordering fence paired with cache_wb: emits an sfence after (post) a batch
// of weakly-ordered flushes. No fence is needed before (pre) a batch, nor
// when plain CLFLUSH (strongly ordered) is the only option.
void MacroAssembler::cache_wbsync(bool is_pre)
{
  assert(VM_Version::supports_clflush(), "clflush should be available");
  bool optimized = VM_Version::supports_clflushopt();
  bool no_evict = VM_Version::supports_clwb();

  // pick the correct implementation

  if (!is_pre && (optimized || no_evict)) {
    // need an sfence for post flush when using clflushopt or clwb
    // otherwise no need for any synchronization
    sfence();
  }
}
10641
// Returns the logical negation of the given x86 condition code
// (e.g. zero <-> notZero, less <-> greaterEqual).
Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
  switch (cond) {
    // Note some conditions are synonyms for others
    case Assembler::zero:         return Assembler::notZero;
    case Assembler::notZero:      return Assembler::zero;
    case Assembler::less:         return Assembler::greaterEqual;
    case Assembler::lessEqual:    return Assembler::greater;
    case Assembler::greater:      return Assembler::lessEqual;
    case Assembler::greaterEqual: return Assembler::less;
    case Assembler::below:        return Assembler::aboveEqual;
    case Assembler::belowEqual:   return Assembler::above;
    case Assembler::above:        return Assembler::belowEqual;
    case Assembler::aboveEqual:   return Assembler::below;
    case Assembler::overflow:     return Assembler::noOverflow;
    case Assembler::noOverflow:   return Assembler::overflow;
    case Assembler::negative:     return Assembler::positive;
    case Assembler::positive:     return Assembler::negative;
    case Assembler::parity:       return Assembler::noParity;
    case Assembler::noParity:     return Assembler::parity;
  }
  ShouldNotReachHere(); return Assembler::overflow;
}
10664
// This is simply a call to Thread::current()
// Loads the current Thread* into `thread` via a runtime call, preserving all
// caller-saved GPRs around the call. rax is saved only when it is not the
// destination, since it carries the call's result.
void MacroAssembler::get_thread_slow(Register thread) {
  if (thread != rax) {
    push(rax);
  }
  push(rdi);
  push(rsi);
  push(rdx);
  push(rcx);
  push(r8);
  push(r9);
  push(r10);
  push(r11);

  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);

  // Pops must mirror the push order exactly.
  pop(r11);
  pop(r10);
  pop(r9);
  pop(r8);
  pop(rcx);
  pop(rdx);
  pop(rsi);
  pop(rdi);
  if (thread != rax) {
    // Move the result out of rax, then restore the caller's rax.
    mov(thread, rax);
    pop(rax);
  }
}
10694
// Debug check that (sp + bias) is aligned to 2*wordSize (16 bytes on x86-64);
// stops the VM with `msg` on misalignment. tmp is only clobbered when a
// non-zero bias is supplied.
void MacroAssembler::check_stack_alignment(Register sp, const char* msg, unsigned bias, Register tmp) {
  Label L_stack_ok;
  if (bias == 0) {
    testptr(sp, 2 * wordSize - 1);
  } else {
    // lea(tmp, Address(rsp, bias));
    mov(tmp, sp);
    addptr(tmp, bias);
    testptr(tmp, 2 * wordSize - 1);
  }
  jcc(Assembler::equal, L_stack_ok);
  block_comment(msg);
  stop(msg);
  bind(L_stack_ok);
}
10710
// Implements fast-locking.
//
// obj: the object to be locked
// reg_rax: rax
// thread: the thread which attempts to lock obj
// tmp: a temporary register
//
// Falls through on success; jumps to `slow` when the lock-stack is full, the
// object has an inflated monitor, the CAS fails, or (optionally) the class is
// value-based. Clobbers reg_rax and tmp.
void MacroAssembler::fast_lock(Register basic_lock, Register obj, Register reg_rax, Register tmp, Label& slow) {
  Register thread = r15_thread;

  assert(reg_rax == rax, "");
  assert_different_registers(basic_lock, obj, reg_rax, thread, tmp);

  Label push;
  const Register top = tmp;

  // Preload the markWord. It is important that this is the first
  // instruction emitted as it is part of C1's null check semantics.
  movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must not be synchronized on; divert to slow path.
    load_klass(tmp, obj, rscratch1);
    testb(Address(tmp, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow);
  }

  // Load top.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

  // Check if the lock-stack is full.
  cmpl(top, LockStack::end_offset());
  jcc(Assembler::greaterEqual, slow);

  // Check for recursion.
  cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
  jcc(Assembler::equal, push);

  // Check header for monitor (0b10).
  testptr(reg_rax, markWord::monitor_value);
  jcc(Assembler::notZero, slow);

  // Try to lock. Transition lock bits 0b01 => 0b00
  movptr(tmp, reg_rax);
  andptr(tmp, ~(int32_t)markWord::unlocked_value);
  orptr(reg_rax, markWord::unlocked_value);
  // Mask inline_type bit such that we go to the slow path if object is an inline type
  andptr(reg_rax, ~((int) markWord::inline_type_bit_in_place));

  lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
  jcc(Assembler::notEqual, slow);

  // Restore top, CAS clobbers register.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

  bind(push);
  // After successful lock, push object on lock-stack.
  movptr(Address(thread, top), obj);
  incrementl(top, oopSize);
  movl(Address(thread, JavaThread::lock_stack_top_offset()), top);
}
10775
// Implements fast-unlocking.
//
// obj: the object to be unlocked
// reg_rax: rax
// thread: the thread
// tmp: a temporary register
//
// Falls through on success; jumps to `slow` (with the lock-stack restored)
// when obj is not on top of the lock-stack, has an inflated monitor, or the
// unlocking CAS fails. Clobbers reg_rax and tmp.
void MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register tmp, Label& slow) {
  Register thread = r15_thread;

  assert(reg_rax == rax, "");
  assert_different_registers(obj, reg_rax, thread, tmp);

  Label unlocked, push_and_slow;
  const Register top = tmp;

  // Check if obj is top of lock-stack.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
  cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
  jcc(Assembler::notEqual, slow);

  // Pop lock-stack.
  DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
  subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

  // Check if recursive.
  cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
  jcc(Assembler::equal, unlocked);

  // Not recursive. Check header for monitor (0b10).
  movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
  testptr(reg_rax, markWord::monitor_value);
  jcc(Assembler::notZero, push_and_slow);

#ifdef ASSERT
  // Check header not unlocked (0b01).
  Label not_unlocked;
  testptr(reg_rax, markWord::unlocked_value);
  jcc(Assembler::zero, not_unlocked);
  stop("fast_unlock already unlocked");
  bind(not_unlocked);
#endif

  // Try to unlock. Transition lock bits 0b00 => 0b01
  movptr(tmp, reg_rax);
  orptr(tmp, markWord::unlocked_value);
  lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
  jcc(Assembler::equal, unlocked);

  bind(push_and_slow);
  // Restore lock-stack and handle the unlock in runtime.
#ifdef ASSERT
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
  movptr(Address(thread, top), obj);
#endif
  addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
  jmp(slow);

  bind(unlocked);
}
10835
// Saves legacy GPRs state on stack.
// Reserves 16 word slots; slot 11 is deliberately left unused (presumably
// corresponds to rsp, which is not stored — confirm against callers).
// Paired with restore_legacy_gprs(), which reads the same layout.
void MacroAssembler::save_legacy_gprs() {
  subq(rsp, 16 * wordSize);
  movq(Address(rsp, 15 * wordSize), rax);
  movq(Address(rsp, 14 * wordSize), rcx);
  movq(Address(rsp, 13 * wordSize), rdx);
  movq(Address(rsp, 12 * wordSize), rbx);
  movq(Address(rsp, 10 * wordSize), rbp);
  movq(Address(rsp, 9 * wordSize), rsi);
  movq(Address(rsp, 8 * wordSize), rdi);
  movq(Address(rsp, 7 * wordSize), r8);
  movq(Address(rsp, 6 * wordSize), r9);
  movq(Address(rsp, 5 * wordSize), r10);
  movq(Address(rsp, 4 * wordSize), r11);
  movq(Address(rsp, 3 * wordSize), r12);
  movq(Address(rsp, 2 * wordSize), r13);
  movq(Address(rsp, wordSize), r14);
  movq(Address(rsp, 0), r15);
}
10855
// Restores legacy GPRs state from stack.
// Reads the exact layout written by save_legacy_gprs() (slot 11 unused),
// then releases the 16-word area.
void MacroAssembler::restore_legacy_gprs() {
  movq(r15, Address(rsp, 0));
  movq(r14, Address(rsp, wordSize));
  movq(r13, Address(rsp, 2 * wordSize));
  movq(r12, Address(rsp, 3 * wordSize));
  movq(r11, Address(rsp, 4 * wordSize));
  movq(r10, Address(rsp, 5 * wordSize));
  movq(r9, Address(rsp, 6 * wordSize));
  movq(r8, Address(rsp, 7 * wordSize));
  movq(rdi, Address(rsp, 8 * wordSize));
  movq(rsi, Address(rsp, 9 * wordSize));
  movq(rbp, Address(rsp, 10 * wordSize));
  movq(rbx, Address(rsp, 12 * wordSize));
  movq(rdx, Address(rsp, 13 * wordSize));
  movq(rcx, Address(rsp, 14 * wordSize));
  movq(rax, Address(rsp, 15 * wordSize));
  addq(rsp, 16 * wordSize);
}
10875
// Loads the address of an AOT runtime-constants field into reg. When dumping
// an AOT code cache the address is emitted as a relocatable ExternalAddress
// (so it can be patched at load time); otherwise the raw 64-bit address is
// materialized directly. Only available with CDS support.
void MacroAssembler::load_aotrc_address(Register reg, address a) {
#if INCLUDE_CDS
  assert(AOTRuntimeConstants::contains(a), "address out of range for data area");
  if (AOTCodeCache::is_on_for_dump()) {
    // all aotrc field addresses should be registered in the AOTCodeCache address table
    lea(reg, ExternalAddress(a));
  } else {
    mov64(reg, (uint64_t)a);
  }
#else
  ShouldNotReachHere();
#endif
}
10889
10890 void MacroAssembler::setcc(Assembler::Condition comparison, Register dst) {
10891 if (VM_Version::supports_apx_f()) {
10892 esetzucc(comparison, dst);
10893 } else {
10894 setb(comparison, dst);
10895 movzbl(dst, dst);
10896 }
10897 }