1 /*
2 * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "code/aotCodeCache.hpp"
28 #include "code/compiledIC.hpp"
29 #include "compiler/compiler_globals.hpp"
30 #include "compiler/disassembler.hpp"
31 #include "ci/ciInlineKlass.hpp"
32 #include "crc32c.h"
33 #include "gc/shared/barrierSet.hpp"
34 #include "gc/shared/barrierSetAssembler.hpp"
35 #include "gc/shared/collectedHeap.inline.hpp"
36 #include "gc/shared/tlab_globals.hpp"
37 #include "interpreter/bytecodeHistogram.hpp"
38 #include "interpreter/interpreter.hpp"
39 #include "interpreter/interpreterRuntime.hpp"
40 #include "jvm.h"
41 #include "memory/resourceArea.hpp"
42 #include "memory/universe.hpp"
43 #include "oops/accessDecorators.hpp"
44 #include "oops/compressedKlass.inline.hpp"
45 #include "oops/compressedOops.inline.hpp"
46 #include "oops/klass.inline.hpp"
47 #include "oops/resolvedFieldEntry.hpp"
48 #include "prims/methodHandles.hpp"
49 #include "runtime/continuation.hpp"
50 #include "runtime/interfaceSupport.inline.hpp"
51 #include "runtime/javaThread.hpp"
52 #include "runtime/jniHandles.hpp"
53 #include "runtime/objectMonitor.hpp"
54 #include "runtime/os.hpp"
55 #include "runtime/safepoint.hpp"
56 #include "runtime/safepointMechanism.hpp"
57 #include "runtime/sharedRuntime.hpp"
58 #include "runtime/signature_cc.hpp"
59 #include "runtime/stubRoutines.hpp"
60 #include "utilities/checkedCast.hpp"
61 #include "utilities/macros.hpp"
62 #include "vmreg_x86.inline.hpp"
63 #ifdef COMPILER2
64 #include "opto/output.hpp"
65 #endif
66
67 #ifdef PRODUCT
68 #define BLOCK_COMMENT(str) /* nothing */
69 #define STOP(error) stop(error)
70 #else
71 #define BLOCK_COMMENT(str) block_comment(str)
72 #define STOP(error) block_comment(error); stop(error)
73 #endif
74
75 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
76
77 #ifdef ASSERT
78 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
79 #endif
80
81 static const Assembler::Condition reverse[] = {
82 Assembler::noOverflow /* overflow = 0x0 */ ,
83 Assembler::overflow /* noOverflow = 0x1 */ ,
84 Assembler::aboveEqual /* carrySet = 0x2, below = 0x2 */ ,
85 Assembler::below /* aboveEqual = 0x3, carryClear = 0x3 */ ,
86 Assembler::notZero /* zero = 0x4, equal = 0x4 */ ,
87 Assembler::zero /* notZero = 0x5, notEqual = 0x5 */ ,
88 Assembler::above /* belowEqual = 0x6 */ ,
89 Assembler::belowEqual /* above = 0x7 */ ,
90 Assembler::positive /* negative = 0x8 */ ,
91 Assembler::negative /* positive = 0x9 */ ,
92 Assembler::noParity /* parity = 0xa */ ,
93 Assembler::parity /* noParity = 0xb */ ,
94 Assembler::greaterEqual /* less = 0xc */ ,
95 Assembler::less /* greaterEqual = 0xd */ ,
96 Assembler::greater /* lessEqual = 0xe */ ,
97 Assembler::lessEqual /* greater = 0xf, */
98
99 };
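// Note: reverse[] maps each condition code to its logical negation, indexed by the
// condition's x86 encoding (e.g. reverse[Assembler::equal] == Assembler::notEqual).
// It is used by jump_cc() below to invert a conditional branch around an indirect jmp
// when the target may be out of reach of a 32-bit displacement.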
100
101
102 // Implementation of MacroAssembler
103
104 Address MacroAssembler::as_Address(AddressLiteral adr) {
105 // amd64 always does this as a pc-rel
106 // we can be absolute or disp based on the instruction type
107 // jmp/call are displacements, others are absolute
108 assert(!adr.is_lval(), "must be rval");
109 assert(reachable(adr), "must be");
110 return Address(checked_cast<int32_t>(adr.target() - pc()), adr.target(), adr.reloc());
111
112 }
113
114 Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
115 AddressLiteral base = adr.base();
116 lea(rscratch, base);
117 Address index = adr.index();
118 assert(index._disp == 0, "must not have disp"); // maybe it can?
119 Address array(rscratch, index._index, index._scale, index._disp);
120 return array;
121 }
122
123 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
124 Label L, E;
125
126 #ifdef _WIN64
127 // Windows always allocates space for its register args
128 assert(num_args <= 4, "only register arguments supported");
129 subq(rsp, frame::arg_reg_save_area_bytes);
130 #endif
131
132 // Align stack if necessary
133 testl(rsp, 15);
134 jcc(Assembler::zero, L);
135
136 subq(rsp, 8);
137 call(RuntimeAddress(entry_point));
138 addq(rsp, 8);
139 jmp(E);
140
141 bind(L);
142 call(RuntimeAddress(entry_point));
143
144 bind(E);
145
146 #ifdef _WIN64
147 // restore stack pointer
148 addq(rsp, frame::arg_reg_save_area_bytes);
149 #endif
150 }
151
152 void MacroAssembler::cmp64(Register src1, AddressLiteral src2, Register rscratch) {
153 assert(!src2.is_lval(), "should use cmpptr");
154 assert(rscratch != noreg || always_reachable(src2), "missing");
155
156 if (reachable(src2)) {
157 cmpq(src1, as_Address(src2));
158 } else {
159 lea(rscratch, src2);
160 Assembler::cmpq(src1, Address(rscratch, 0));
161 }
162 }
163
164 int MacroAssembler::corrected_idivq(Register reg) {
165 // Full implementation of Java ldiv and lrem; checks for special
166 // case as described in JVM spec., p.243 & p.271. The function
167 // returns the (pc) offset of the idivl instruction - may be needed
168 // for implicit exceptions.
169 //
170 // normal case special case
171 //
172 // input : rax: dividend min_long
173 // reg: divisor (may not be rax/rdx) -1
174 //
175 // output: rax: quotient (= rax idiv reg) min_long
176 // rdx: remainder (= rax irem reg) 0
177 assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
178 static const int64_t min_long = 0x8000000000000000;
179 Label normal_case, special_case;
180
181 // check for special case
182 cmp64(rax, ExternalAddress((address) &min_long), rdx /*rscratch*/);
183 jcc(Assembler::notEqual, normal_case);
184 xorl(rdx, rdx); // prepare rdx for possible special case (where
185 // remainder = 0)
186 cmpq(reg, -1);
187 jcc(Assembler::equal, special_case);
188
189 // handle normal case
190 bind(normal_case);
191 cdqq();
192 int idivq_offset = offset();
193 idivq(reg);
194
195 // normal and special case exit
196 bind(special_case);
197
198 return idivq_offset;
199 }
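// (Sketch of the intended use of corrected_idivq, per the header comment above: the
// caller places the dividend in rax and the divisor in some other register, and the
// returned offset marks the idivq instruction so that a hardware divide fault there
// can be mapped to the corresponding Java exception.)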
200
201 void MacroAssembler::decrementq(Register reg, int value) {
202 if (value == min_jint) { subq(reg, value); return; }
203 if (value < 0) { incrementq(reg, -value); return; }
204 if (value == 0) { ; return; }
205 if (value == 1 && UseIncDec) { decq(reg) ; return; }
206 /* else */ { subq(reg, value) ; return; }
207 }
208
209 void MacroAssembler::decrementq(Address dst, int value) {
210 if (value == min_jint) { subq(dst, value); return; }
211 if (value < 0) { incrementq(dst, -value); return; }
212 if (value == 0) { ; return; }
213 if (value == 1 && UseIncDec) { decq(dst) ; return; }
214 /* else */ { subq(dst, value) ; return; }
215 }
216
217 void MacroAssembler::incrementq(AddressLiteral dst, Register rscratch) {
218 assert(rscratch != noreg || always_reachable(dst), "missing");
219
220 if (reachable(dst)) {
221 incrementq(as_Address(dst));
222 } else {
223 lea(rscratch, dst);
224 incrementq(Address(rscratch, 0));
225 }
226 }
227
228 void MacroAssembler::incrementq(Register reg, int value) {
229 if (value == min_jint) { addq(reg, value); return; }
230 if (value < 0) { decrementq(reg, -value); return; }
231 if (value == 0) { ; return; }
232 if (value == 1 && UseIncDec) { incq(reg) ; return; }
233 /* else */ { addq(reg, value) ; return; }
234 }
235
236 void MacroAssembler::incrementq(Address dst, int value) {
237 if (value == min_jint) { addq(dst, value); return; }
238 if (value < 0) { decrementq(dst, -value); return; }
239 if (value == 0) { ; return; }
240 if (value == 1 && UseIncDec) { incq(dst) ; return; }
241 /* else */ { addq(dst, value) ; return; }
242 }
243
244 // 32bit can do a case table jump in one instruction but we no longer allow the base
245 // to be installed in the Address class
246 void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
247 lea(rscratch, entry.base());
248 Address dispatch = entry.index();
249 assert(dispatch._base == noreg, "must be");
250 dispatch._base = rscratch;
251 jmp(dispatch);
252 }
253
254 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
255 ShouldNotReachHere(); // 64bit doesn't use two regs
256 cmpq(x_lo, y_lo);
257 }
258
259 void MacroAssembler::lea(Register dst, AddressLiteral src) {
260 mov_literal64(dst, (intptr_t)src.target(), src.rspec());
261 }
262
263 void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
264 lea(rscratch, adr);
265 movptr(dst, rscratch);
266 }
267
268 void MacroAssembler::leave() {
269 // %%% is this really better? Why not on 32bit too?
270 emit_int8((unsigned char)0xC9); // LEAVE
271 }
272
273 void MacroAssembler::lneg(Register hi, Register lo) {
274 ShouldNotReachHere(); // 64bit doesn't use two regs
275 negq(lo);
276 }
277
278 void MacroAssembler::movoop(Register dst, jobject obj) {
279 mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
280 }
281
282 void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
283 mov_literal64(rscratch, (intptr_t)obj, oop_Relocation::spec_for_immediate());
284 movq(dst, rscratch);
285 }
286
287 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
288 mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
289 }
290
291 void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
292 mov_literal64(rscratch, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
293 movq(dst, rscratch);
294 }
295
296 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
297 if (src.is_lval()) {
298 mov_literal64(dst, (intptr_t)src.target(), src.rspec());
299 } else {
300 if (reachable(src)) {
301 movq(dst, as_Address(src));
302 } else {
303 lea(dst, src);
304 movq(dst, Address(dst, 0));
305 }
306 }
307 }
308
309 void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
310 movq(as_Address(dst, rscratch), src);
311 }
312
313 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
314 movq(dst, as_Address(src, dst /*rscratch*/));
315 }
316
317 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
318 void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
319 if (is_simm32(src)) {
320 movptr(dst, checked_cast<int32_t>(src));
321 } else {
322 mov64(rscratch, src);
323 movq(dst, rscratch);
324 }
325 }
326
327 void MacroAssembler::pushoop(jobject obj, Register rscratch) {
328 movoop(rscratch, obj);
329 push(rscratch);
330 }
331
332 void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
333 mov_metadata(rscratch, obj);
334 push(rscratch);
335 }
336
337 void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
338 lea(rscratch, src);
339 if (src.is_lval()) {
340 push(rscratch);
341 } else {
342 pushq(Address(rscratch, 0));
343 }
344 }
345
346 static void pass_arg0(MacroAssembler* masm, Register arg) {
347 if (c_rarg0 != arg ) {
348 masm->mov(c_rarg0, arg);
349 }
350 }
351
352 static void pass_arg1(MacroAssembler* masm, Register arg) {
353 if (c_rarg1 != arg ) {
354 masm->mov(c_rarg1, arg);
355 }
356 }
357
358 static void pass_arg2(MacroAssembler* masm, Register arg) {
359 if (c_rarg2 != arg ) {
360 masm->mov(c_rarg2, arg);
361 }
362 }
363
364 static void pass_arg3(MacroAssembler* masm, Register arg) {
365 if (c_rarg3 != arg ) {
366 masm->mov(c_rarg3, arg);
367 }
368 }
369
370 void MacroAssembler::stop(const char* msg) {
371 if (ShowMessageBoxOnError) {
372 address rip = pc();
373 pusha(); // get regs on stack
374 lea(c_rarg1, InternalAddress(rip));
375 movq(c_rarg2, rsp); // pass pointer to regs array
376 }
377 // Skip AOT caching C strings in scratch buffer.
378 const char* str = (code_section()->scratch_emit()) ? msg : AOTCodeCache::add_C_string(msg);
379 lea(c_rarg0, ExternalAddress((address) str));
380 andq(rsp, -16); // align stack as required by ABI
381 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
382 hlt();
383 }
384
385 void MacroAssembler::warn(const char* msg) {
386 push(rbp);
387 movq(rbp, rsp);
388 andq(rsp, -16); // align stack as required by push_CPU_state and call
389 push_CPU_state(); // keeps alignment at 16 bytes
390
391 #ifdef _WIN64
392 // Windows always allocates space for its register args
393 subq(rsp, frame::arg_reg_save_area_bytes);
394 #endif
395 lea(c_rarg0, ExternalAddress((address) msg));
396 call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
397
398 #ifdef _WIN64
399 // restore stack pointer
400 addq(rsp, frame::arg_reg_save_area_bytes);
401 #endif
402 pop_CPU_state();
403 mov(rsp, rbp);
404 pop(rbp);
405 }
406
407 void MacroAssembler::print_state() {
408 address rip = pc();
409 pusha(); // get regs on stack
410 push(rbp);
411 movq(rbp, rsp);
412 andq(rsp, -16); // align stack as required by push_CPU_state and call
413 push_CPU_state(); // keeps alignment at 16 bytes
414
415 lea(c_rarg0, InternalAddress(rip));
416 lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
417 call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
418
419 pop_CPU_state();
420 mov(rsp, rbp);
421 pop(rbp);
422 popa();
423 }
424
425 #ifndef PRODUCT
426 extern "C" void findpc(intptr_t x);
427 #endif
428
429 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
430 // In order to get locks to work, we need to fake an in_VM state
431 if (ShowMessageBoxOnError) {
432 JavaThread* thread = JavaThread::current();
433 JavaThreadState saved_state = thread->thread_state();
434 thread->set_thread_state(_thread_in_vm);
435 #ifndef PRODUCT
436 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
437 ttyLocker ttyl;
438 BytecodeCounter::print();
439 }
440 #endif
441 // To see where a verify_oop failed, get $ebx+40/X for this frame.
442 // XXX correct this offset for amd64
443 // This is the value of eip which points to where verify_oop will return.
444 if (os::message_box(msg, "Execution stopped, print registers?")) {
445 print_state64(pc, regs);
446 BREAKPOINT;
447 }
448 }
449 fatal("DEBUG MESSAGE: %s", msg);
450 }
451
452 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
453 ttyLocker ttyl;
454 DebuggingContext debugging{};
455 tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
456 #ifndef PRODUCT
457 tty->cr();
458 findpc(pc);
459 tty->cr();
460 #endif
461 #define PRINT_REG(rax, value) \
462 { tty->print("%s = ", #rax); os::print_location(tty, value); }
463 PRINT_REG(rax, regs[15]);
464 PRINT_REG(rbx, regs[12]);
465 PRINT_REG(rcx, regs[14]);
466 PRINT_REG(rdx, regs[13]);
467 PRINT_REG(rdi, regs[8]);
468 PRINT_REG(rsi, regs[9]);
469 PRINT_REG(rbp, regs[10]);
470 // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
471 PRINT_REG(rsp, (intptr_t)(&regs[16]));
472 PRINT_REG(r8 , regs[7]);
473 PRINT_REG(r9 , regs[6]);
474 PRINT_REG(r10, regs[5]);
475 PRINT_REG(r11, regs[4]);
476 PRINT_REG(r12, regs[3]);
477 PRINT_REG(r13, regs[2]);
478 PRINT_REG(r14, regs[1]);
479 PRINT_REG(r15, regs[0]);
480 #undef PRINT_REG
481 // Print some words near the top of the stack.
482 int64_t* rsp = &regs[16];
483 int64_t* dump_sp = rsp;
484 for (int col1 = 0; col1 < 8; col1++) {
485 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
486 os::print_location(tty, *dump_sp++);
487 }
488 for (int row = 0; row < 25; row++) {
489 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
490 for (int col = 0; col < 4; col++) {
491 tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
492 }
493 tty->cr();
494 }
495 // Print some instructions around pc:
496 Disassembler::decode((address)pc-64, (address)pc);
497 tty->print_cr("--------");
498 Disassembler::decode((address)pc, (address)pc+32);
499 }
500
501 // The java_calling_convention describes stack locations as ideal slots on
502 // a frame with no abi restrictions. Since we must observe abi restrictions
503 // (like the placement of the register window) the slots must be biased by
504 // the following value.
505 static int reg2offset_in(VMReg r) {
506 // Account for saved rbp and return address
507 // This should really be in_preserve_stack_slots
508 return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
509 }
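// Example: with 4-byte stack slots, the first incoming stack argument (reg2stack() == 0)
// maps to rbp + 16, i.e. just past the saved rbp and the return address.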
510
511 static int reg2offset_out(VMReg r) {
512 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
513 }
514
515 // A long move
516 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
517
518 // The calling convention assures us that each VMRegPair is either
519 // really one physical register or a pair of adjacent stack slots.
520
521 if (src.is_single_phys_reg() ) {
522 if (dst.is_single_phys_reg()) {
523 if (dst.first() != src.first()) {
524 mov(dst.first()->as_Register(), src.first()->as_Register());
525 }
526 } else {
527 assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
528 src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
529 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
530 }
531 } else if (dst.is_single_phys_reg()) {
532 assert(src.is_single_reg(), "not a stack pair");
533 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
534 } else {
535 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
536 movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
537 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
538 }
539 }
540
541 // A double move
542 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
543
544 // The calling convention assures us that each VMRegPair is either
545 // really one physical register or a pair of adjacent stack slots.
546
547 if (src.is_single_phys_reg() ) {
548 if (dst.is_single_phys_reg()) {
549 // In theory these overlap but the ordering is such that this is likely a nop
550 if ( src.first() != dst.first()) {
551 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
552 }
553 } else {
554 assert(dst.is_single_reg(), "not a stack pair");
555 movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
556 }
557 } else if (dst.is_single_phys_reg()) {
558 assert(src.is_single_reg(), "not a stack pair");
559 movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
560 } else {
561 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
562 movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
563 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
564 }
565 }
566
567
568 // A float arg may have to be moved via an integer register (stack to stack) or an XMM register
569 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
570 assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
571
572 // The calling convention assures us that each VMRegPair is either
573 // really one physical register or a pair of adjacent stack slots.
574
575 if (src.first()->is_stack()) {
576 if (dst.first()->is_stack()) {
577 movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
578 movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
579 } else {
580 // stack to reg
581 assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
582 movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
583 }
584 } else if (dst.first()->is_stack()) {
585 // reg to stack
586 assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
587 movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
588 } else {
589 // reg to reg
590 // In theory these overlap but the ordering is such that this is likely a nop
591 if ( src.first() != dst.first()) {
592 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
593 }
594 }
595 }
596
597 // On 64 bit we store integer-like items to the stack as
598 // 64-bit items (x86_32/64 ABI) even though Java would only store
599 // 32 bits for a parameter. On 32 bit it would simply be 32 bits,
600 // so this routine does 32->32 on 32 bit and 32->64 on 64 bit.
601 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
602 if (src.first()->is_stack()) {
603 if (dst.first()->is_stack()) {
604 // stack to stack
605 movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
606 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
607 } else {
608 // stack to reg
609 movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
610 }
611 } else if (dst.first()->is_stack()) {
612 // reg to stack
613 // Do we really have to sign extend???
614 // __ movslq(src.first()->as_Register(), src.first()->as_Register());
615 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
616 } else {
617 // Do we really have to sign extend???
618 // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
619 if (dst.first() != src.first()) {
620 movq(dst.first()->as_Register(), src.first()->as_Register());
621 }
622 }
623 }
624
625 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
626 if (src.first()->is_stack()) {
627 if (dst.first()->is_stack()) {
628 // stack to stack
629 movq(rax, Address(rbp, reg2offset_in(src.first())));
630 movq(Address(rsp, reg2offset_out(dst.first())), rax);
631 } else {
632 // stack to reg
633 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
634 }
635 } else if (dst.first()->is_stack()) {
636 // reg to stack
637 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
638 } else {
639 if (dst.first() != src.first()) {
640 movq(dst.first()->as_Register(), src.first()->as_Register());
641 }
642 }
643 }
644
645 // An oop arg. Must pass a handle not the oop itself
646 void MacroAssembler::object_move(OopMap* map,
647 int oop_handle_offset,
648 int framesize_in_slots,
649 VMRegPair src,
650 VMRegPair dst,
651 bool is_receiver,
652 int* receiver_offset) {
653
654 // must pass a handle. First figure out the location we use as a handle
655
656 Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
657
658 // See if the oop is null; if it is, we need no handle
659
660 if (src.first()->is_stack()) {
661
662 // Oop is already on the stack as an argument
663 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
664 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
665 if (is_receiver) {
666 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
667 }
668
669 cmpptr(Address(rbp, reg2offset_in(src.first())), NULL_WORD);
670 lea(rHandle, Address(rbp, reg2offset_in(src.first())));
671 // conditionally move a null
672 cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
673 } else {
674
675 // Oop is in a register we must store it to the space we reserve
676 // on the stack for oop_handles and pass a handle if oop is non-null
677
678 const Register rOop = src.first()->as_Register();
679 int oop_slot;
680 if (rOop == j_rarg0)
681 oop_slot = 0;
682 else if (rOop == j_rarg1)
683 oop_slot = 1;
684 else if (rOop == j_rarg2)
685 oop_slot = 2;
686 else if (rOop == j_rarg3)
687 oop_slot = 3;
688 else if (rOop == j_rarg4)
689 oop_slot = 4;
690 else {
691 assert(rOop == j_rarg5, "wrong register");
692 oop_slot = 5;
693 }
694
695 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
696 int offset = oop_slot*VMRegImpl::stack_slot_size;
697
698 map->set_oop(VMRegImpl::stack2reg(oop_slot));
699 // Store oop in handle area, may be null
700 movptr(Address(rsp, offset), rOop);
701 if (is_receiver) {
702 *receiver_offset = offset;
703 }
704
705 cmpptr(rOop, NULL_WORD);
706 lea(rHandle, Address(rsp, offset));
707 // conditionally move a null from the handle area where it was just stored
708 cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
709 }
710
711 // If the arg is on the stack then place it, otherwise it is already in the correct reg.
712 if (dst.first()->is_stack()) {
713 movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
714 }
715 }
716
717 void MacroAssembler::addptr(Register dst, int32_t imm32) {
718 addq(dst, imm32);
719 }
720
721 void MacroAssembler::addptr(Register dst, Register src) {
722 addq(dst, src);
723 }
724
725 void MacroAssembler::addptr(Address dst, Register src) {
726 addq(dst, src);
727 }
728
729 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
730 assert(rscratch != noreg || always_reachable(src), "missing");
731
732 if (reachable(src)) {
733 Assembler::addsd(dst, as_Address(src));
734 } else {
735 lea(rscratch, src);
736 Assembler::addsd(dst, Address(rscratch, 0));
737 }
738 }
739
740 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src, Register rscratch) {
741 assert(rscratch != noreg || always_reachable(src), "missing");
742
743 if (reachable(src)) {
744 addss(dst, as_Address(src));
745 } else {
746 lea(rscratch, src);
747 addss(dst, Address(rscratch, 0));
748 }
749 }
750
751 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
752 assert(rscratch != noreg || always_reachable(src), "missing");
753
754 if (reachable(src)) {
755 Assembler::addpd(dst, as_Address(src));
756 } else {
757 lea(rscratch, src);
758 Assembler::addpd(dst, Address(rscratch, 0));
759 }
760 }
761
762 // See 8273459. Function for ensuring 64-byte alignment, intended for stubs only.
763 // Stub code is generated once and never copied.
764 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
765 void MacroAssembler::align64() {
766 align(64, (uint)(uintptr_t)pc());
767 }
768
769 void MacroAssembler::align32() {
770 align(32, (uint)(uintptr_t)pc());
771 }
772
773 void MacroAssembler::align(uint modulus) {
774 // 8273459: Ensure alignment is possible with current segment alignment
775 assert(modulus <= (uintx)CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
776 align(modulus, offset());
777 }
778
779 void MacroAssembler::align(uint modulus, uint target) {
780 if (target % modulus != 0) {
781 nop(modulus - (target % modulus));
782 }
783 }
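// Example: align(8) when offset() == 13 emits 8 - (13 % 8) == 3 bytes of nop, so the
// next instruction starts at an offset that is a multiple of 8.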
784
785 void MacroAssembler::push_f(XMMRegister r) {
786 subptr(rsp, wordSize);
787 movflt(Address(rsp, 0), r);
788 }
789
790 void MacroAssembler::pop_f(XMMRegister r) {
791 movflt(r, Address(rsp, 0));
792 addptr(rsp, wordSize);
793 }
794
795 void MacroAssembler::push_d(XMMRegister r) {
796 subptr(rsp, 2 * wordSize);
797 movdbl(Address(rsp, 0), r);
798 }
799
800 void MacroAssembler::pop_d(XMMRegister r) {
801 movdbl(r, Address(rsp, 0));
802 addptr(rsp, 2 * Interpreter::stackElementSize);
803 }
804
805 void MacroAssembler::push_ppx(Register src) {
806 if (VM_Version::supports_apx_f()) {
807 pushp(src);
808 } else {
809 Assembler::push(src);
810 }
811 }
812
813 void MacroAssembler::pop_ppx(Register dst) {
814 if (VM_Version::supports_apx_f()) {
815 popp(dst);
816 } else {
817 Assembler::pop(dst);
818 }
819 }
820
821 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
822 // Used in sign-masking with aligned address.
823 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
824 assert(rscratch != noreg || always_reachable(src), "missing");
825
826 if (UseAVX > 2 &&
827 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
828 (dst->encoding() >= 16)) {
829 vpand(dst, dst, src, AVX_512bit, rscratch);
830 } else if (reachable(src)) {
831 Assembler::andpd(dst, as_Address(src));
832 } else {
833 lea(rscratch, src);
834 Assembler::andpd(dst, Address(rscratch, 0));
835 }
836 }
837
838 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register rscratch) {
839 // Used in sign-masking with aligned address.
840 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
841 assert(rscratch != noreg || always_reachable(src), "missing");
842
843 if (reachable(src)) {
844 Assembler::andps(dst, as_Address(src));
845 } else {
846 lea(rscratch, src);
847 Assembler::andps(dst, Address(rscratch, 0));
848 }
849 }
850
851 void MacroAssembler::andptr(Register dst, int32_t imm32) {
852 andq(dst, imm32);
853 }
854
855 void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) {
856 assert(rscratch != noreg || always_reachable(src), "missing");
857
858 if (reachable(src)) {
859 andq(dst, as_Address(src));
860 } else {
861 lea(rscratch, src);
862 andq(dst, Address(rscratch, 0));
863 }
864 }
865
866 void MacroAssembler::atomic_incl(Address counter_addr) {
867 lock();
868 incrementl(counter_addr);
869 }
870
871 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register rscratch) {
872 assert(rscratch != noreg || always_reachable(counter_addr), "missing");
873
874 if (reachable(counter_addr)) {
875 atomic_incl(as_Address(counter_addr));
876 } else {
877 lea(rscratch, counter_addr);
878 atomic_incl(Address(rscratch, 0));
879 }
880 }
881
882 void MacroAssembler::atomic_incq(Address counter_addr) {
883 lock();
884 incrementq(counter_addr);
885 }
886
887 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register rscratch) {
888 assert(rscratch != noreg || always_reachable(counter_addr), "missing");
889
890 if (reachable(counter_addr)) {
891 atomic_incq(as_Address(counter_addr));
892 } else {
893 lea(rscratch, counter_addr);
894 atomic_incq(Address(rscratch, 0));
895 }
896 }
897
898 // Writes successive pages to the stack until the given offset is reached, to check for
899 // stack overflow + shadow pages. This clobbers tmp.
900 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
901 movptr(tmp, rsp);
902 // Bang stack for total size given plus shadow page size.
903 // Bang one page at a time because large size can bang beyond yellow and
904 // red zones.
905 Label loop;
906 bind(loop);
907 movl(Address(tmp, (-(int)os::vm_page_size())), size );
908 subptr(tmp, (int)os::vm_page_size());
909 subl(size, (int)os::vm_page_size());
910 jcc(Assembler::greater, loop);
911
912 // Bang down shadow pages too.
913 // At this point, (tmp-0) is the last address touched, so don't
914 // touch it again. (It was touched as (tmp-pagesize) but then tmp
915 // was post-decremented.) Skip this address by starting at i=1, and
916 // touch a few more pages below. N.B. It is important to touch all
917 // the way down including all pages in the shadow zone.
918 for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()); i++) {
919 // this could be any sized move, but since it can serve as a debugging crumb
920 // the bigger the better.
921 movptr(Address(tmp, (-i*(int)os::vm_page_size())), size );
922 }
923 }
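// Example for bang_stack_size (assuming 4K pages): with size equal to 3 pages the first
// loop touches tmp-4K, tmp-8K and tmp-12K; the shadow-zone loop then keeps touching
// pages below that.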
924
925 void MacroAssembler::reserved_stack_check() {
926 // testing if reserved zone needs to be enabled
927 Label no_reserved_zone_enabling;
928
929 cmpptr(rsp, Address(r15_thread, JavaThread::reserved_stack_activation_offset()));
930 jcc(Assembler::below, no_reserved_zone_enabling);
931
932 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), r15_thread);
933 jump(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
934 should_not_reach_here();
935
936 bind(no_reserved_zone_enabling);
937 }
938
939 void MacroAssembler::c2bool(Register x) {
940 // implements x == 0 ? 0 : 1
941 // note: must only look at least-significant byte of x
942 // since C-style booleans are stored in one byte
943 // only! (was bug)
944 andl(x, 0xFF);
945 setb(Assembler::notZero, x);
946 }
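// Example: for x == 0x4500 the low byte is 0x00, so c2bool produces 0; for x == 0x4501
// the low byte is 0x01, so the result is 1.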
947
948 // Wouldn't need if AddressLiteral version had new name
949 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
950 Assembler::call(L, rtype);
951 }
952
953 void MacroAssembler::call(Register entry) {
954 Assembler::call(entry);
955 }
956
957 void MacroAssembler::call(AddressLiteral entry, Register rscratch) {
958 assert(rscratch != noreg || always_reachable(entry), "missing");
959
960 if (reachable(entry)) {
961 Assembler::call_literal(entry.target(), entry.rspec());
962 } else {
963 lea(rscratch, entry);
964 Assembler::call(rscratch);
965 }
966 }
967
968 void MacroAssembler::ic_call(address entry, jint method_index) {
969 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
970 // Needs full 64-bit immediate for later patching.
971 mov64(rax, (int64_t)Universe::non_oop_word());
972 call(AddressLiteral(entry, rh));
973 }
974
975 int MacroAssembler::ic_check_size() {
976 return UseCompactObjectHeaders ? 17 : 14;
977 }
978
979 int MacroAssembler::ic_check(int end_alignment) {
980 Register receiver = j_rarg0;
981 Register data = rax;
982 Register temp = rscratch1;
983
984 // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
985 // before the inline cache check, so we don't have to execute any nop instructions when dispatching
986 // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
987 // before the inline cache check here, and not after
988 align(end_alignment, offset() + ic_check_size());
989
990 int uep_offset = offset();
991
992 if (UseCompactObjectHeaders) {
993 load_narrow_klass_compact(temp, receiver);
994 cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
995 } else if (UseCompressedClassPointers) {
996 movl(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
997 cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
998 } else {
999 movptr(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
1000 cmpptr(temp, Address(data, CompiledICData::speculated_klass_offset()));
1001 }
1002
1003 // if inline cache check fails, then jump to runtime routine
1004 jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1005 assert((offset() % end_alignment) == 0, "Misaligned verified entry point (%d, %d, %d)", uep_offset, offset(), end_alignment);
1006
1007 return uep_offset;
1008 }
1009
1010 void MacroAssembler::emit_static_call_stub() {
1011 // Static stub relocation also tags the Method* in the code-stream.
1012 mov_metadata(rbx, (Metadata*) nullptr); // Method is zapped till fixup time.
1013 // This is recognized as unresolved by relocs/nativeinst/ic code.
1014 jump(RuntimeAddress(pc()));
1015 }
1016
1017 // Implementation of call_VM versions
1018
1019 void MacroAssembler::call_VM(Register oop_result,
1020 address entry_point,
1021 bool check_exceptions) {
1022 Label C, E;
1023 call(C, relocInfo::none);
1024 jmp(E);
1025
1026 bind(C);
1027 call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1028 ret(0);
1029
1030 bind(E);
1031 }
1032
1033 void MacroAssembler::call_VM(Register oop_result,
1034 address entry_point,
1035 Register arg_1,
1036 bool check_exceptions) {
1037 Label C, E;
1038 call(C, relocInfo::none);
1039 jmp(E);
1040
1041 bind(C);
1042 pass_arg1(this, arg_1);
1043 call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1044 ret(0);
1045
1046 bind(E);
1047 }
1048
1049 void MacroAssembler::call_VM(Register oop_result,
1050 address entry_point,
1051 Register arg_1,
1052 Register arg_2,
1053 bool check_exceptions) {
1054 Label C, E;
1055 call(C, relocInfo::none);
1056 jmp(E);
1057
1058 bind(C);
1059
1060 assert_different_registers(arg_1, c_rarg2);
1061
1062 pass_arg2(this, arg_2);
1063 pass_arg1(this, arg_1);
1064 call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1065 ret(0);
1066
1067 bind(E);
1068 }
1069
1070 void MacroAssembler::call_VM(Register oop_result,
1071 address entry_point,
1072 Register arg_1,
1073 Register arg_2,
1074 Register arg_3,
1075 bool check_exceptions) {
1076 Label C, E;
1077 call(C, relocInfo::none);
1078 jmp(E);
1079
1080 bind(C);
1081
1082 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1083 assert_different_registers(arg_2, c_rarg3);
1084 pass_arg3(this, arg_3);
1085 pass_arg2(this, arg_2);
1086 pass_arg1(this, arg_1);
1087 call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1088 ret(0);
1089
1090 bind(E);
1091 }
1092
1093 void MacroAssembler::call_VM(Register oop_result,
1094 Register last_java_sp,
1095 address entry_point,
1096 int number_of_arguments,
1097 bool check_exceptions) {
1098 call_VM_base(oop_result, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1099 }
1100
1101 void MacroAssembler::call_VM(Register oop_result,
1102 Register last_java_sp,
1103 address entry_point,
1104 Register arg_1,
1105 bool check_exceptions) {
1106 pass_arg1(this, arg_1);
1107 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1108 }
1109
1110 void MacroAssembler::call_VM(Register oop_result,
1111 Register last_java_sp,
1112 address entry_point,
1113 Register arg_1,
1114 Register arg_2,
1115 bool check_exceptions) {
1116
1117 assert_different_registers(arg_1, c_rarg2);
1118 pass_arg2(this, arg_2);
1119 pass_arg1(this, arg_1);
1120 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1121 }
1122
1123 void MacroAssembler::call_VM(Register oop_result,
1124 Register last_java_sp,
1125 address entry_point,
1126 Register arg_1,
1127 Register arg_2,
1128 Register arg_3,
1129 bool check_exceptions) {
1130 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1131 assert_different_registers(arg_2, c_rarg3);
1132 pass_arg3(this, arg_3);
1133 pass_arg2(this, arg_2);
1134 pass_arg1(this, arg_1);
1135 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1136 }
1137
1138 void MacroAssembler::super_call_VM(Register oop_result,
1139 Register last_java_sp,
1140 address entry_point,
1141 int number_of_arguments,
1142 bool check_exceptions) {
1143 MacroAssembler::call_VM_base(oop_result, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1144 }
1145
1146 void MacroAssembler::super_call_VM(Register oop_result,
1147 Register last_java_sp,
1148 address entry_point,
1149 Register arg_1,
1150 bool check_exceptions) {
1151 pass_arg1(this, arg_1);
1152 super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1153 }
1154
1155 void MacroAssembler::super_call_VM(Register oop_result,
1156 Register last_java_sp,
1157 address entry_point,
1158 Register arg_1,
1159 Register arg_2,
1160 bool check_exceptions) {
1161
1162 assert_different_registers(arg_1, c_rarg2);
1163 pass_arg2(this, arg_2);
1164 pass_arg1(this, arg_1);
1165 super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1166 }
1167
1168 void MacroAssembler::super_call_VM(Register oop_result,
1169 Register last_java_sp,
1170 address entry_point,
1171 Register arg_1,
1172 Register arg_2,
1173 Register arg_3,
1174 bool check_exceptions) {
1175 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1176 assert_different_registers(arg_2, c_rarg3);
1177 pass_arg3(this, arg_3);
1178 pass_arg2(this, arg_2);
1179 pass_arg1(this, arg_1);
1180 super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1181 }
1182
1183 void MacroAssembler::call_VM_base(Register oop_result,
1184 Register last_java_sp,
1185 address entry_point,
1186 int number_of_arguments,
1187 bool check_exceptions) {
1188 Register java_thread = r15_thread;
1189
1190 // determine last_java_sp register
1191 if (!last_java_sp->is_valid()) {
1192 last_java_sp = rsp;
1193 }
1194 // debugging support
1195 assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
1196 #ifdef ASSERT
1197 // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1198 // r12 is the heapbase.
1199 if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
1200 #endif // ASSERT
1201
1202 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
1203 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1204
1205 // push java thread (becomes first argument of C function)
1206
1207 mov(c_rarg0, r15_thread);
1208
1209 // set last Java frame before call
1210 assert(last_java_sp != rbp, "can't use ebp/rbp");
1211
1212 // Only interpreter should have to set fp
1213 set_last_Java_frame(last_java_sp, rbp, nullptr, rscratch1);
1214
1215 // do the call, remove parameters
1216 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1217
1218 #ifdef ASSERT
1219 // Check that thread register is not clobbered.
1220 guarantee(java_thread != rax, "change this code");
1221 push(rax);
1222 { Label L;
1223 get_thread_slow(rax);
1224 cmpptr(java_thread, rax);
1225 jcc(Assembler::equal, L);
1226 STOP("MacroAssembler::call_VM_base: java_thread not callee saved?");
1227 bind(L);
1228 }
1229 pop(rax);
1230 #endif
1231
1232 // reset last Java frame
1233 // Only interpreter should have to clear fp
1234 reset_last_Java_frame(true);
1235
1236 // C++ interp handles this in the interpreter
1237 check_and_handle_popframe();
1238 check_and_handle_earlyret();
1239
1240 if (check_exceptions) {
1241 // check for pending exceptions (java_thread is set upon return)
1242 cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1243 // This used to conditionally jump to forward_exception; however, it is
1244 // possible, if we relocate, that the branch will not reach. So we must jump
1245 // around it so we can always reach the target.
1246
1247 Label ok;
1248 jcc(Assembler::equal, ok);
1249 jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1250 bind(ok);
1251 }
1252
1253 // get oop result if there is one and reset the value in the thread
1254 if (oop_result->is_valid()) {
1255 get_vm_result_oop(oop_result);
1256 }
1257 }
1258
1259 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1260 // Calculating the value for last_Java_sp is somewhat subtle.
1261 // call_VM does an intermediate call which places a return address on
1262 // the stack just under the stack pointer as the user finished with it.
1263 // This allows us to retrieve last_Java_pc from last_Java_sp[-1].
1264
1265 // We've pushed one address, correct last_Java_sp
1266 lea(rax, Address(rsp, wordSize));
1267
1268 call_VM_base(oop_result, rax, entry_point, number_of_arguments, check_exceptions);
1269 }
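// Stack sketch at the lea above: the call emitted by call_VM has just pushed a return
// address, so rsp points at it and rax = rsp + wordSize is the caller's sp; reading
// last_Java_sp[-1] (one word below rax) recovers that return address as last_Java_pc.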
1270
1271 // Use this method when the MacroAssembler version of call_VM_leaf_base() should be called from the Interpreter.
1272 void MacroAssembler::call_VM_leaf0(address entry_point) {
1273 MacroAssembler::call_VM_leaf_base(entry_point, 0);
1274 }
1275
1276 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1277 call_VM_leaf_base(entry_point, number_of_arguments);
1278 }
1279
1280 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1281 pass_arg0(this, arg_0);
1282 call_VM_leaf(entry_point, 1);
1283 }
1284
1285 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1286
1287 assert_different_registers(arg_0, c_rarg1);
1288 pass_arg1(this, arg_1);
1289 pass_arg0(this, arg_0);
1290 call_VM_leaf(entry_point, 2);
1291 }
1292
1293 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1294 assert_different_registers(arg_0, c_rarg1, c_rarg2);
1295 assert_different_registers(arg_1, c_rarg2);
1296 pass_arg2(this, arg_2);
1297 pass_arg1(this, arg_1);
1298 pass_arg0(this, arg_0);
1299 call_VM_leaf(entry_point, 3);
1300 }
1301
1302 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1303 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
1304 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1305 assert_different_registers(arg_2, c_rarg3);
1306 pass_arg3(this, arg_3);
1307 pass_arg2(this, arg_2);
1308 pass_arg1(this, arg_1);
1309 pass_arg0(this, arg_0);
1310 call_VM_leaf(entry_point, 3);
1311 }
1312
1313 void MacroAssembler::super_call_VM_leaf(address entry_point) {
1314 MacroAssembler::call_VM_leaf_base(entry_point, 1);
1315 }
1316
1317 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1318 pass_arg0(this, arg_0);
1319 MacroAssembler::call_VM_leaf_base(entry_point, 1);
1320 }
1321
1322 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1323 assert_different_registers(arg_0, c_rarg1);
1324 pass_arg1(this, arg_1);
1325 pass_arg0(this, arg_0);
1326 MacroAssembler::call_VM_leaf_base(entry_point, 2);
1327 }
1328
1329 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1330 assert_different_registers(arg_0, c_rarg1, c_rarg2);
1331 assert_different_registers(arg_1, c_rarg2);
1332 pass_arg2(this, arg_2);
1333 pass_arg1(this, arg_1);
1334 pass_arg0(this, arg_0);
1335 MacroAssembler::call_VM_leaf_base(entry_point, 3);
1336 }
1337
1338 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1339 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
1340 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1341 assert_different_registers(arg_2, c_rarg3);
1342 pass_arg3(this, arg_3);
1343 pass_arg2(this, arg_2);
1344 pass_arg1(this, arg_1);
1345 pass_arg0(this, arg_0);
1346 MacroAssembler::call_VM_leaf_base(entry_point, 4);
1347 }
1348
1349 void MacroAssembler::get_vm_result_oop(Register oop_result) {
1350 movptr(oop_result, Address(r15_thread, JavaThread::vm_result_oop_offset()));
1351 movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
1352 verify_oop_msg(oop_result, "broken oop in call_VM_base");
1353 }
1354
1355 void MacroAssembler::get_vm_result_metadata(Register metadata_result) {
1356 movptr(metadata_result, Address(r15_thread, JavaThread::vm_result_metadata_offset()));
1357 movptr(Address(r15_thread, JavaThread::vm_result_metadata_offset()), NULL_WORD);
1358 }
1359
1360 void MacroAssembler::check_and_handle_earlyret() {
1361 }
1362
1363 void MacroAssembler::check_and_handle_popframe() {
1364 }
1365
1366 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm, Register rscratch) {
1367 assert(rscratch != noreg || always_reachable(src1), "missing");
1368
1369 if (reachable(src1)) {
1370 cmpl(as_Address(src1), imm);
1371 } else {
1372 lea(rscratch, src1);
1373 cmpl(Address(rscratch, 0), imm);
1374 }
1375 }
1376
1377 void MacroAssembler::cmp32(Register src1, AddressLiteral src2, Register rscratch) {
1378 assert(!src2.is_lval(), "use cmpptr");
1379 assert(rscratch != noreg || always_reachable(src2), "missing");
1380
1381 if (reachable(src2)) {
1382 cmpl(src1, as_Address(src2));
1383 } else {
1384 lea(rscratch, src2);
1385 cmpl(src1, Address(rscratch, 0));
1386 }
1387 }
1388
1389 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1390 Assembler::cmpl(src1, imm);
1391 }
1392
1393 void MacroAssembler::cmp32(Register src1, Address src2) {
1394 Assembler::cmpl(src1, src2);
1395 }
1396
1397 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1398 ucomisd(opr1, opr2);
1399
1400 Label L;
1401 if (unordered_is_less) {
1402 movl(dst, -1);
1403 jcc(Assembler::parity, L);
1404 jcc(Assembler::below , L);
1405 movl(dst, 0);
1406 jcc(Assembler::equal , L);
1407 increment(dst);
1408 } else { // unordered is greater
1409 movl(dst, 1);
1410 jcc(Assembler::parity, L);
1411 jcc(Assembler::above , L);
1412 movl(dst, 0);
1413 jcc(Assembler::equal , L);
1414 decrementl(dst);
1415 }
1416 bind(L);
1417 }
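// The result in dst is -1, 0 or +1; when the operands are unordered (NaN),
// unordered_is_less selects -1 and otherwise +1, which matches the Java dcmpl/dcmpg
// (and, for cmpss2int below, fcmpl/fcmpg) semantics.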
1418
1419 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1420 ucomiss(opr1, opr2);
1421
1422 Label L;
1423 if (unordered_is_less) {
1424 movl(dst, -1);
1425 jcc(Assembler::parity, L);
1426 jcc(Assembler::below , L);
1427 movl(dst, 0);
1428 jcc(Assembler::equal , L);
1429 increment(dst);
1430 } else { // unordered is greater
1431 movl(dst, 1);
1432 jcc(Assembler::parity, L);
1433 jcc(Assembler::above , L);
1434 movl(dst, 0);
1435 jcc(Assembler::equal , L);
1436 decrementl(dst);
1437 }
1438 bind(L);
1439 }
1440
1441
1442 void MacroAssembler::cmp8(AddressLiteral src1, int imm, Register rscratch) {
1443 assert(rscratch != noreg || always_reachable(src1), "missing");
1444
1445 if (reachable(src1)) {
1446 cmpb(as_Address(src1), imm);
1447 } else {
1448 lea(rscratch, src1);
1449 cmpb(Address(rscratch, 0), imm);
1450 }
1451 }
1452
1453 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2, Register rscratch) {
1454 assert(rscratch != noreg || always_reachable(src2), "missing");
1455
1456 if (src2.is_lval()) {
1457 movptr(rscratch, src2);
1458 Assembler::cmpq(src1, rscratch);
1459 } else if (reachable(src2)) {
1460 cmpq(src1, as_Address(src2));
1461 } else {
1462 lea(rscratch, src2);
1463 Assembler::cmpq(src1, Address(rscratch, 0));
1464 }
1465 }
1466
1467 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2, Register rscratch) {
1468 assert(src2.is_lval(), "not a mem-mem compare");
1469 // moves src2's literal address
1470 movptr(rscratch, src2);
1471 Assembler::cmpq(src1, rscratch);
1472 }
1473
1474 void MacroAssembler::cmpoop(Register src1, Register src2) {
1475 cmpptr(src1, src2);
1476 }
1477
1478 void MacroAssembler::cmpoop(Register src1, Address src2) {
1479 cmpptr(src1, src2);
1480 }
1481
1482 void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) {
1483 movoop(rscratch, src2);
1484 cmpptr(src1, rscratch);
1485 }
1486
1487 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) {
1488 assert(rscratch != noreg || always_reachable(adr), "missing");
1489
1490 if (reachable(adr)) {
1491 lock();
1492 cmpxchgptr(reg, as_Address(adr));
1493 } else {
1494 lea(rscratch, adr);
1495 lock();
1496 cmpxchgptr(reg, Address(rscratch, 0));
1497 }
1498 }
1499
1500 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
1501 cmpxchgq(reg, adr);
1502 }
1503
1504 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1505 assert(rscratch != noreg || always_reachable(src), "missing");
1506
1507 if (reachable(src)) {
1508 Assembler::comisd(dst, as_Address(src));
1509 } else {
1510 lea(rscratch, src);
1511 Assembler::comisd(dst, Address(rscratch, 0));
1512 }
1513 }
1514
1515 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1516 assert(rscratch != noreg || always_reachable(src), "missing");
1517
1518 if (reachable(src)) {
1519 Assembler::comiss(dst, as_Address(src));
1520 } else {
1521 lea(rscratch, src);
1522 Assembler::comiss(dst, Address(rscratch, 0));
1523 }
1524 }
1525
1526
1527 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch) {
1528 assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1529
1530 Condition negated_cond = negate_condition(cond);
1531 Label L;
1532 jcc(negated_cond, L);
1533 pushf(); // Preserve flags
1534 atomic_incl(counter_addr, rscratch);
1535 popf();
1536 bind(L);
1537 }
1538
1539 int MacroAssembler::corrected_idivl(Register reg) {
1540 // Full implementation of Java idiv and irem; checks for
1541 // special case as described in JVM spec., p.243 & p.271.
1542 // The function returns the (pc) offset of the idivl
1543 // instruction - may be needed for implicit exceptions.
1544 //
1545 // normal case special case
1546 //
1547 // input : rax: dividend min_int
1548 // reg: divisor (may not be rax/rdx) -1
1549 //
1550 // output: rax: quotient (= rax idiv reg) min_int
1551 // rdx: remainder (= rax irem reg) 0
1552 assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
1553 const int min_int = 0x80000000;
1554 Label normal_case, special_case;
1555
1556 // check for special case
1557 cmpl(rax, min_int);
1558 jcc(Assembler::notEqual, normal_case);
1559 xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
1560 cmpl(reg, -1);
1561 jcc(Assembler::equal, special_case);
1562
1563 // handle normal case
1564 bind(normal_case);
1565 cdql();
1566 int idivl_offset = offset();
1567 idivl(reg);
1568
1569 // normal and special case exit
1570 bind(special_case);
1571
1572 return idivl_offset;
1573 }
1574
1575
1576
1577 void MacroAssembler::decrementl(Register reg, int value) {
1578 if (value == min_jint) {subl(reg, value) ; return; }
1579 if (value < 0) { incrementl(reg, -value); return; }
1580 if (value == 0) { ; return; }
1581 if (value == 1 && UseIncDec) { decl(reg) ; return; }
1582 /* else */ { subl(reg, value) ; return; }
1583 }
1584
1585 void MacroAssembler::decrementl(Address dst, int value) {
1586 if (value == min_jint) {subl(dst, value) ; return; }
1587 if (value < 0) { incrementl(dst, -value); return; }
1588 if (value == 0) { ; return; }
1589 if (value == 1 && UseIncDec) { decl(dst) ; return; }
1590 /* else */ { subl(dst, value) ; return; }
1591 }
1592
1593 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
1594 assert(shift_value > 0, "illegal shift value");
1595 Label _is_positive;
1596 testl (reg, reg);
1597 jcc (Assembler::positive, _is_positive);
1598 int offset = (1 << shift_value) - 1 ;
1599
1600 if (offset == 1) {
1601 incrementl(reg);
1602 } else {
1603 addl(reg, offset);
1604 }
1605
1606 bind (_is_positive);
1607 sarl(reg, shift_value);
1608 }
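// Example: division_with_shift(reg, 2) for reg == -7 adds (1 << 2) - 1 == 3 to get -4,
// then arithmetic-shifts right by 2 to get -1, i.e. -7 / 4 rounded toward zero.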
1609
1610 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1611 assert(rscratch != noreg || always_reachable(src), "missing");
1612
1613 if (reachable(src)) {
1614 Assembler::divsd(dst, as_Address(src));
1615 } else {
1616 lea(rscratch, src);
1617 Assembler::divsd(dst, Address(rscratch, 0));
1618 }
1619 }
1620
1621 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1622 assert(rscratch != noreg || always_reachable(src), "missing");
1623
1624 if (reachable(src)) {
1625 Assembler::divss(dst, as_Address(src));
1626 } else {
1627 lea(rscratch, src);
1628 Assembler::divss(dst, Address(rscratch, 0));
1629 }
1630 }
1631
1632 void MacroAssembler::enter() {
1633 push(rbp);
1634 mov(rbp, rsp);
1635 }
1636
1637 void MacroAssembler::post_call_nop() {
1638 if (!Continuations::enabled()) {
1639 return;
1640 }
1641 InstructionMark im(this);
1642 relocate(post_call_nop_Relocation::spec());
1643 InlineSkippedInstructionsCounter skipCounter(this);
1644 emit_int8((uint8_t)0x0f);
1645 emit_int8((uint8_t)0x1f);
1646 emit_int8((uint8_t)0x84);
1647 emit_int8((uint8_t)0x00);
1648 emit_int32(0x00);
1649 }
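// This emits the recommended 8-byte NOP (0F 1F 84 00 <disp32>). The post_call_nop
// relocation lets the runtime recognize the instruction following a call, and its
// 32-bit displacement leaves room for data to be patched in later (see NativePostCallNop).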
1650
1651 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1652 assert(rscratch != noreg || always_reachable(src), "missing");
1653 if (reachable(src)) {
1654 Assembler::mulpd(dst, as_Address(src));
1655 } else {
1656 lea(rscratch, src);
1657 Assembler::mulpd(dst, Address(rscratch, 0));
1658 }
1659 }
1660
1661 // dst = c = a * b + c
1662 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
1663 Assembler::vfmadd231sd(c, a, b);
1664 if (dst != c) {
1665 movdbl(dst, c);
1666 }
1667 }
1668
1669 // dst = c = a * b + c
1670 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
1671 Assembler::vfmadd231ss(c, a, b);
1672 if (dst != c) {
1673 movflt(dst, c);
1674 }
1675 }
1676
1677 // dst = c = a * b + c
1678 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
1679 Assembler::vfmadd231pd(c, a, b, vector_len);
1680 if (dst != c) {
1681 vmovdqu(dst, c);
1682 }
1683 }
1684
1685 // dst = c = a * b + c
1686 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
1687 Assembler::vfmadd231ps(c, a, b, vector_len);
1688 if (dst != c) {
1689 vmovdqu(dst, c);
1690 }
1691 }
1692
1693 // dst = c = a * b + c
1694 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
1695 Assembler::vfmadd231pd(c, a, b, vector_len);
1696 if (dst != c) {
1697 vmovdqu(dst, c);
1698 }
1699 }
1700
1701 // dst = c = a * b + c
1702 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
1703 Assembler::vfmadd231ps(c, a, b, vector_len);
1704 if (dst != c) {
1705 vmovdqu(dst, c);
1706 }
1707 }
1708
1709 void MacroAssembler::incrementl(AddressLiteral dst, Register rscratch) {
1710 assert(rscratch != noreg || always_reachable(dst), "missing");
1711
1712 if (reachable(dst)) {
1713 incrementl(as_Address(dst));
1714 } else {
1715 lea(rscratch, dst);
1716 incrementl(Address(rscratch, 0));
1717 }
1718 }
1719
1720 void MacroAssembler::incrementl(ArrayAddress dst, Register rscratch) {
1721 incrementl(as_Address(dst, rscratch));
1722 }
1723
1724 void MacroAssembler::incrementl(Register reg, int value) {
1725 if (value == min_jint) {addl(reg, value) ; return; }
1726 if (value < 0) { decrementl(reg, -value); return; }
1727 if (value == 0) { ; return; }
1728 if (value == 1 && UseIncDec) { incl(reg) ; return; }
1729 /* else */ { addl(reg, value) ; return; }
1730 }
1731
1732 void MacroAssembler::incrementl(Address dst, int value) {
1733 if (value == min_jint) {addl(dst, value) ; return; }
1734 if (value < 0) { decrementl(dst, -value); return; }
1735 if (value == 0) { ; return; }
1736 if (value == 1 && UseIncDec) { incl(dst) ; return; }
1737 /* else */ { addl(dst, value) ; return; }
1738 }
1739
1740 void MacroAssembler::jump(AddressLiteral dst, Register rscratch) {
1741 assert(rscratch != noreg || always_reachable(dst), "missing");
1742 assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump");
1743 if (reachable(dst)) {
1744 jmp_literal(dst.target(), dst.rspec());
1745 } else {
1746 lea(rscratch, dst);
1747 jmp(rscratch);
1748 }
1749 }
1750
1751 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst, Register rscratch) {
1752 assert(rscratch != noreg || always_reachable(dst), "missing");
1753 assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump_cc");
1754 if (reachable(dst)) {
1755 InstructionMark im(this);
1756 relocate(dst.reloc());
1757 const int short_size = 2;
1758 const int long_size = 6;
1759 int offs = (intptr_t)dst.target() - ((intptr_t)pc());
1760 if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
1761 // 0111 tttn #8-bit disp
1762 emit_int8(0x70 | cc);
1763 emit_int8((offs - short_size) & 0xFF);
1764 } else {
1765 // 0000 1111 1000 tttn #32-bit disp
1766 emit_int8(0x0F);
1767 emit_int8((unsigned char)(0x80 | cc));
1768 emit_int32(offs - long_size);
1769 }
1770 } else {
1771 #ifdef ASSERT
1772 warning("reversing conditional branch");
1773 #endif /* ASSERT */
1774 Label skip;
1775 jccb(reverse[cc], skip);
1776 lea(rscratch, dst);
1777 Assembler::jmp(rscratch);
1778 bind(skip);
1779 }
1780 }
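// When the jump_cc target is not reachable with a 32-bit displacement, the
// fallback above emits, roughly:
//   j<!cc>  skip            ; short branch on the reversed condition
//   lea     rscratch, dst
//   jmp     rscratch        ; 64-bit indirect jump
//   skip: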
1781
1782 void MacroAssembler::cmp32_mxcsr_std(Address mxcsr_save, Register tmp, Register rscratch) {
1783 ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
1784 assert(rscratch != noreg || always_reachable(mxcsr_std), "missing");
1785
1786 stmxcsr(mxcsr_save);
1787 movl(tmp, mxcsr_save);
1788 if (EnableX86ECoreOpts) {
1789 // The mxcsr_std has status bits set for performance on ECore
1790 orl(tmp, 0x003f);
1791 } else {
1792 // Mask out status bits (only check control and mask bits)
1793 andl(tmp, 0xFFC0);
1794 }
1795 cmp32(tmp, mxcsr_std, rscratch);
1796 }
1797
1798 void MacroAssembler::ldmxcsr(AddressLiteral src, Register rscratch) {
1799 assert(rscratch != noreg || always_reachable(src), "missing");
1800
1801 if (reachable(src)) {
1802 Assembler::ldmxcsr(as_Address(src));
1803 } else {
1804 lea(rscratch, src);
1805 Assembler::ldmxcsr(Address(rscratch, 0));
1806 }
1807 }
1808
1809 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1810 int off = offset();
1811 movsbl(dst, src); // movsxb
1812 return off;
1813 }
1814
1815 // Note: load_signed_short used to be called load_signed_word.
1816 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
1817 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
1818 // The term "word" in HotSpot means a 32- or 64-bit machine word.
1819 int MacroAssembler::load_signed_short(Register dst, Address src) {
1820   // Note: only a signed 16 => 32 bit extension (movswl) is done here. A signed
1821   // 16 => 64 bit version would be equally safe, but this matches what the 64-bit
1822   // port has always done, implying callers rely on at most 32 bits of the result.
1823 int off = offset();
1824 movswl(dst, src); // movsxw
1825 return off;
1826 }
1827
1828 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1829   // Use a zero-extending load to avoid partial-register stalls; see Intel Doc.
1830   // AP-526, "Zero-Extension of Short", p. 16, and "3.9 Partial Register Penalties", p. 22.
1831 int off = offset();
1832 movzbl(dst, src); // movzxb
1833 return off;
1834 }
1835
1836 // Note: load_unsigned_short used to be called load_unsigned_word.
1837 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1838   // Use a zero-extending load to avoid partial-register stalls; see Intel Doc.
1839   // AP-526, "Zero-Extension of Short", p. 16, and "3.9 Partial Register Penalties", p. 22.
1840 int off = offset();
1841 movzwl(dst, src); // movzxw
1842 return off;
1843 }
1844
1845 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1846 switch (size_in_bytes) {
1847 case 8: movq(dst, src); break;
1848 case 4: movl(dst, src); break;
1849 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1850 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1851 default: ShouldNotReachHere();
1852 }
1853 }
1854
1855 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1856 switch (size_in_bytes) {
1857 case 8: movq(dst, src); break;
1858 case 4: movl(dst, src); break;
1859 case 2: movw(dst, src); break;
1860 case 1: movb(dst, src); break;
1861 default: ShouldNotReachHere();
1862 }
1863 }
1864
1865 void MacroAssembler::mov32(AddressLiteral dst, Register src, Register rscratch) {
1866 assert(rscratch != noreg || always_reachable(dst), "missing");
1867
1868 if (reachable(dst)) {
1869 movl(as_Address(dst), src);
1870 } else {
1871 lea(rscratch, dst);
1872 movl(Address(rscratch, 0), src);
1873 }
1874 }
1875
1876 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
1877 if (reachable(src)) {
1878 movl(dst, as_Address(src));
1879 } else {
1880 lea(dst, src);
1881 movl(dst, Address(dst, 0));
1882 }
1883 }
1884
1885 // C++ bool manipulation
1886
1887 void MacroAssembler::movbool(Register dst, Address src) {
1888 if(sizeof(bool) == 1)
1889 movb(dst, src);
1890 else if(sizeof(bool) == 2)
1891 movw(dst, src);
1892 else if(sizeof(bool) == 4)
1893 movl(dst, src);
1894 else
1895 // unsupported
1896 ShouldNotReachHere();
1897 }
1898
1899 void MacroAssembler::movbool(Address dst, bool boolconst) {
1900 if(sizeof(bool) == 1)
1901 movb(dst, (int) boolconst);
1902 else if(sizeof(bool) == 2)
1903 movw(dst, (int) boolconst);
1904 else if(sizeof(bool) == 4)
1905 movl(dst, (int) boolconst);
1906 else
1907 // unsupported
1908 ShouldNotReachHere();
1909 }
1910
1911 void MacroAssembler::movbool(Address dst, Register src) {
1912 if(sizeof(bool) == 1)
1913 movb(dst, src);
1914 else if(sizeof(bool) == 2)
1915 movw(dst, src);
1916 else if(sizeof(bool) == 4)
1917 movl(dst, src);
1918 else
1919 // unsupported
1920 ShouldNotReachHere();
1921 }
1922
1923 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src, Register rscratch) {
1924 assert(rscratch != noreg || always_reachable(src), "missing");
1925
1926 if (reachable(src)) {
1927 movdl(dst, as_Address(src));
1928 } else {
1929 lea(rscratch, src);
1930 movdl(dst, Address(rscratch, 0));
1931 }
1932 }
1933
1934 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src, Register rscratch) {
1935 assert(rscratch != noreg || always_reachable(src), "missing");
1936
1937 if (reachable(src)) {
1938 movq(dst, as_Address(src));
1939 } else {
1940 lea(rscratch, src);
1941 movq(dst, Address(rscratch, 0));
1942 }
1943 }
1944
1945 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src, Register rscratch) {
1946 assert(rscratch != noreg || always_reachable(src), "missing");
1947
1948 if (reachable(src)) {
1949 if (UseXmmLoadAndClearUpper) {
1950 movsd (dst, as_Address(src));
1951 } else {
1952 movlpd(dst, as_Address(src));
1953 }
1954 } else {
1955 lea(rscratch, src);
1956 if (UseXmmLoadAndClearUpper) {
1957 movsd (dst, Address(rscratch, 0));
1958 } else {
1959 movlpd(dst, Address(rscratch, 0));
1960 }
1961 }
1962 }
1963
1964 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src, Register rscratch) {
1965 assert(rscratch != noreg || always_reachable(src), "missing");
1966
1967 if (reachable(src)) {
1968 movss(dst, as_Address(src));
1969 } else {
1970 lea(rscratch, src);
1971 movss(dst, Address(rscratch, 0));
1972 }
1973 }
1974
1975 void MacroAssembler::movptr(Register dst, Register src) {
1976 movq(dst, src);
1977 }
1978
1979 void MacroAssembler::movptr(Register dst, Address src) {
1980 movq(dst, src);
1981 }
1982
1983 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
1984 void MacroAssembler::movptr(Register dst, intptr_t src) {
1985 if (is_uimm32(src)) {
1986 movl(dst, checked_cast<uint32_t>(src));
1987 } else if (is_simm32(src)) {
1988 movq(dst, checked_cast<int32_t>(src));
1989 } else {
1990 mov64(dst, src);
1991 }
1992 }
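// movptr(Register, intptr_t) above picks the shortest encoding for the constant:
//   - fits in unsigned 32 bits: movl, whose result is implicitly zero-extended;
//   - fits in signed 32 bits:   movq with a sign-extended imm32;
//   - otherwise:                mov64 (movabs) with the full 64-bit immediate.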
1993
1994 void MacroAssembler::movptr(Address dst, Register src) {
1995 movq(dst, src);
1996 }
1997
1998 void MacroAssembler::movptr(Address dst, int32_t src) {
1999 movslq(dst, src);
2000 }
2001
2002 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2003 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2004 Assembler::movdqu(dst, src);
2005 }
2006
2007 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2008 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2009 Assembler::movdqu(dst, src);
2010 }
2011
2012 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2013 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2014 Assembler::movdqu(dst, src);
2015 }
2016
2017 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2018 assert(rscratch != noreg || always_reachable(src), "missing");
2019
2020 if (reachable(src)) {
2021 movdqu(dst, as_Address(src));
2022 } else {
2023 lea(rscratch, src);
2024 movdqu(dst, Address(rscratch, 0));
2025 }
2026 }
2027
2028 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2029 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2030 Assembler::vmovdqu(dst, src);
2031 }
2032
2033 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2034 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2035 Assembler::vmovdqu(dst, src);
2036 }
2037
2038 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2039 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2040 Assembler::vmovdqu(dst, src);
2041 }
2042
2043 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2044 assert(rscratch != noreg || always_reachable(src), "missing");
2045
2046 if (reachable(src)) {
2047 vmovdqu(dst, as_Address(src));
2048 }
2049 else {
2050 lea(rscratch, src);
2051 vmovdqu(dst, Address(rscratch, 0));
2052 }
2053 }
2054
2055 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2056 assert(rscratch != noreg || always_reachable(src), "missing");
2057
2058 if (vector_len == AVX_512bit) {
2059 evmovdquq(dst, src, AVX_512bit, rscratch);
2060 } else if (vector_len == AVX_256bit) {
2061 vmovdqu(dst, src, rscratch);
2062 } else {
2063 movdqu(dst, src, rscratch);
2064 }
2065 }
2066
2067 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src, int vector_len) {
2068 if (vector_len == AVX_512bit) {
2069 evmovdquq(dst, src, AVX_512bit);
2070 } else if (vector_len == AVX_256bit) {
2071 vmovdqu(dst, src);
2072 } else {
2073 movdqu(dst, src);
2074 }
2075 }
2076
2077 void MacroAssembler::vmovdqu(Address dst, XMMRegister src, int vector_len) {
2078 if (vector_len == AVX_512bit) {
2079 evmovdquq(dst, src, AVX_512bit);
2080 } else if (vector_len == AVX_256bit) {
2081 vmovdqu(dst, src);
2082 } else {
2083 movdqu(dst, src);
2084 }
2085 }
2086
2087 void MacroAssembler::vmovdqu(XMMRegister dst, Address src, int vector_len) {
2088 if (vector_len == AVX_512bit) {
2089 evmovdquq(dst, src, AVX_512bit);
2090 } else if (vector_len == AVX_256bit) {
2091 vmovdqu(dst, src);
2092 } else {
2093 movdqu(dst, src);
2094 }
2095 }
2096
2097 void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
2098 assert(rscratch != noreg || always_reachable(src), "missing");
2099
2100 if (reachable(src)) {
2101 vmovdqa(dst, as_Address(src));
2102 }
2103 else {
2104 lea(rscratch, src);
2105 vmovdqa(dst, Address(rscratch, 0));
2106 }
2107 }
2108
2109 void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2110 assert(rscratch != noreg || always_reachable(src), "missing");
2111
2112 if (vector_len == AVX_512bit) {
2113 evmovdqaq(dst, src, AVX_512bit, rscratch);
2114 } else if (vector_len == AVX_256bit) {
2115 vmovdqa(dst, src, rscratch);
2116 } else {
2117 movdqa(dst, src, rscratch);
2118 }
2119 }
2120
2121 void MacroAssembler::kmov(KRegister dst, Address src) {
2122 if (VM_Version::supports_avx512bw()) {
2123 kmovql(dst, src);
2124 } else {
2125 assert(VM_Version::supports_evex(), "");
2126 kmovwl(dst, src);
2127 }
2128 }
2129
2130 void MacroAssembler::kmov(Address dst, KRegister src) {
2131 if (VM_Version::supports_avx512bw()) {
2132 kmovql(dst, src);
2133 } else {
2134 assert(VM_Version::supports_evex(), "");
2135 kmovwl(dst, src);
2136 }
2137 }
2138
2139 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2140 if (VM_Version::supports_avx512bw()) {
2141 kmovql(dst, src);
2142 } else {
2143 assert(VM_Version::supports_evex(), "");
2144 kmovwl(dst, src);
2145 }
2146 }
2147
2148 void MacroAssembler::kmov(Register dst, KRegister src) {
2149 if (VM_Version::supports_avx512bw()) {
2150 kmovql(dst, src);
2151 } else {
2152 assert(VM_Version::supports_evex(), "");
2153 kmovwl(dst, src);
2154 }
2155 }
2156
2157 void MacroAssembler::kmov(KRegister dst, Register src) {
2158 if (VM_Version::supports_avx512bw()) {
2159 kmovql(dst, src);
2160 } else {
2161 assert(VM_Version::supports_evex(), "");
2162 kmovwl(dst, src);
2163 }
2164 }
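// The kmov helpers above size the opmask transfer by CPU feature: AVX-512BW
// widens the usable mask to 64 bits (kmovq, needed for byte-granularity masks),
// while base AVX-512F/EVEX only guarantees the 16-bit kmovw form.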
2165
2166 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register rscratch) {
2167 assert(rscratch != noreg || always_reachable(src), "missing");
2168
2169 if (reachable(src)) {
2170 kmovql(dst, as_Address(src));
2171 } else {
2172 lea(rscratch, src);
2173 kmovql(dst, Address(rscratch, 0));
2174 }
2175 }
2176
2177 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register rscratch) {
2178 assert(rscratch != noreg || always_reachable(src), "missing");
2179
2180 if (reachable(src)) {
2181 kmovwl(dst, as_Address(src));
2182 } else {
2183 lea(rscratch, src);
2184 kmovwl(dst, Address(rscratch, 0));
2185 }
2186 }
2187
2188 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2189 int vector_len, Register rscratch) {
2190 assert(rscratch != noreg || always_reachable(src), "missing");
2191
2192 if (reachable(src)) {
2193 Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2194 } else {
2195 lea(rscratch, src);
2196 Assembler::evmovdqub(dst, mask, Address(rscratch, 0), merge, vector_len);
2197 }
2198 }
2199
2200 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2201 int vector_len, Register rscratch) {
2202 assert(rscratch != noreg || always_reachable(src), "missing");
2203
2204 if (reachable(src)) {
2205 Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2206 } else {
2207 lea(rscratch, src);
2208 Assembler::evmovdquw(dst, mask, Address(rscratch, 0), merge, vector_len);
2209 }
2210 }
2211
2212 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2213 assert(rscratch != noreg || always_reachable(src), "missing");
2214
2215 if (reachable(src)) {
2216 Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2217 } else {
2218 lea(rscratch, src);
2219 Assembler::evmovdqul(dst, mask, Address(rscratch, 0), merge, vector_len);
2220 }
2221 }
2222
2223 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2224 assert(rscratch != noreg || always_reachable(src), "missing");
2225
2226 if (reachable(src)) {
2227 Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2228 } else {
2229 lea(rscratch, src);
2230 Assembler::evmovdquq(dst, mask, Address(rscratch, 0), merge, vector_len);
2231 }
2232 }
2233
2234 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2235 assert(rscratch != noreg || always_reachable(src), "missing");
2236
2237 if (reachable(src)) {
2238 Assembler::evmovdquq(dst, as_Address(src), vector_len);
2239 } else {
2240 lea(rscratch, src);
2241 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2242 }
2243 }
2244
2245 void MacroAssembler::evmovdqaq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2246 assert(rscratch != noreg || always_reachable(src), "missing");
2247
2248 if (reachable(src)) {
2249 Assembler::evmovdqaq(dst, mask, as_Address(src), merge, vector_len);
2250 } else {
2251 lea(rscratch, src);
2252 Assembler::evmovdqaq(dst, mask, Address(rscratch, 0), merge, vector_len);
2253 }
2254 }
2255
2256 void MacroAssembler::evmovdqaq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2257 assert(rscratch != noreg || always_reachable(src), "missing");
2258
2259 if (reachable(src)) {
2260 Assembler::evmovdqaq(dst, as_Address(src), vector_len);
2261 } else {
2262 lea(rscratch, src);
2263 Assembler::evmovdqaq(dst, Address(rscratch, 0), vector_len);
2264 }
2265 }
2266
2267 void MacroAssembler::movapd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2268 assert(rscratch != noreg || always_reachable(src), "missing");
2269
2270 if (reachable(src)) {
2271 Assembler::movapd(dst, as_Address(src));
2272 } else {
2273 lea(rscratch, src);
2274 Assembler::movapd(dst, Address(rscratch, 0));
2275 }
2276 }
2277
2278 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
2279 assert(rscratch != noreg || always_reachable(src), "missing");
2280
2281 if (reachable(src)) {
2282 Assembler::movdqa(dst, as_Address(src));
2283 } else {
2284 lea(rscratch, src);
2285 Assembler::movdqa(dst, Address(rscratch, 0));
2286 }
2287 }
2288
2289 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2290 assert(rscratch != noreg || always_reachable(src), "missing");
2291
2292 if (reachable(src)) {
2293 Assembler::movsd(dst, as_Address(src));
2294 } else {
2295 lea(rscratch, src);
2296 Assembler::movsd(dst, Address(rscratch, 0));
2297 }
2298 }
2299
2300 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2301 assert(rscratch != noreg || always_reachable(src), "missing");
2302
2303 if (reachable(src)) {
2304 Assembler::movss(dst, as_Address(src));
2305 } else {
2306 lea(rscratch, src);
2307 Assembler::movss(dst, Address(rscratch, 0));
2308 }
2309 }
2310
2311 void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) {
2312 assert(rscratch != noreg || always_reachable(src), "missing");
2313
2314 if (reachable(src)) {
2315 Assembler::movddup(dst, as_Address(src));
2316 } else {
2317 lea(rscratch, src);
2318 Assembler::movddup(dst, Address(rscratch, 0));
2319 }
2320 }
2321
2322 void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2323 assert(rscratch != noreg || always_reachable(src), "missing");
2324
2325 if (reachable(src)) {
2326 Assembler::vmovddup(dst, as_Address(src), vector_len);
2327 } else {
2328 lea(rscratch, src);
2329 Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
2330 }
2331 }
2332
2333 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2334 assert(rscratch != noreg || always_reachable(src), "missing");
2335
2336 if (reachable(src)) {
2337 Assembler::mulsd(dst, as_Address(src));
2338 } else {
2339 lea(rscratch, src);
2340 Assembler::mulsd(dst, Address(rscratch, 0));
2341 }
2342 }
2343
2344 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2345 assert(rscratch != noreg || always_reachable(src), "missing");
2346
2347 if (reachable(src)) {
2348 Assembler::mulss(dst, as_Address(src));
2349 } else {
2350 lea(rscratch, src);
2351 Assembler::mulss(dst, Address(rscratch, 0));
2352 }
2353 }
2354
2355 void MacroAssembler::null_check(Register reg, int offset) {
2356 if (needs_explicit_null_check(offset)) {
2357 // provoke OS null exception if reg is null by
2358 // accessing M[reg] w/o changing any (non-CC) registers
2359     // NOTE: cmpptr is plenty here to provoke a segv
2360 cmpptr(rax, Address(reg, 0));
2361 // Note: should probably use testl(rax, Address(reg, 0));
2362 // may be shorter code (however, this version of
2363 // testl needs to be implemented first)
2364 } else {
2365 // nothing to do, (later) access of M[reg + offset]
2366 // will provoke OS null exception if reg is null
2367 }
2368 }
2369
2370 void MacroAssembler::test_markword_is_inline_type(Register markword, Label& is_inline_type) {
2371 andptr(markword, markWord::inline_type_mask_in_place);
2372 cmpptr(markword, markWord::inline_type_pattern);
2373 jcc(Assembler::equal, is_inline_type);
2374 }
2375
2376 void MacroAssembler::test_oop_is_not_inline_type(Register object, Register tmp, Label& not_inline_type, bool can_be_null) {
2377 if (can_be_null) {
2378 testptr(object, object);
2379 jcc(Assembler::zero, not_inline_type);
2380 }
2381 const int is_inline_type_mask = markWord::inline_type_pattern;
2382 movptr(tmp, Address(object, oopDesc::mark_offset_in_bytes()));
2383 andptr(tmp, is_inline_type_mask);
2384 cmpptr(tmp, is_inline_type_mask);
2385 jcc(Assembler::notEqual, not_inline_type);
2386 }
2387
2388 void MacroAssembler::test_field_is_null_free_inline_type(Register flags, Register temp_reg, Label& is_null_free_inline_type) {
2389 movl(temp_reg, flags);
2390 testl(temp_reg, 1 << ResolvedFieldEntry::is_null_free_inline_type_shift);
2391 jcc(Assembler::notEqual, is_null_free_inline_type);
2392 }
2393
2394 void MacroAssembler::test_field_is_not_null_free_inline_type(Register flags, Register temp_reg, Label& not_null_free_inline_type) {
2395 movl(temp_reg, flags);
2396 testl(temp_reg, 1 << ResolvedFieldEntry::is_null_free_inline_type_shift);
2397 jcc(Assembler::equal, not_null_free_inline_type);
2398 }
2399
2400 void MacroAssembler::test_field_is_flat(Register flags, Register temp_reg, Label& is_flat) {
2401 movl(temp_reg, flags);
2402 testl(temp_reg, 1 << ResolvedFieldEntry::is_flat_shift);
2403 jcc(Assembler::notEqual, is_flat);
2404 }
2405
2406 void MacroAssembler::test_field_has_null_marker(Register flags, Register temp_reg, Label& has_null_marker) {
2407 movl(temp_reg, flags);
2408 testl(temp_reg, 1 << ResolvedFieldEntry::has_null_marker_shift);
2409 jcc(Assembler::notEqual, has_null_marker);
2410 }
2411
2412 void MacroAssembler::test_oop_prototype_bit(Register oop, Register temp_reg, int32_t test_bit, bool jmp_set, Label& jmp_label) {
2413 Label test_mark_word;
2414 // load mark word
2415 movptr(temp_reg, Address(oop, oopDesc::mark_offset_in_bytes()));
2416   // check whether the mark word is unlocked (i.e. not displaced)
2417 testl(temp_reg, markWord::unlocked_value);
2418 jccb(Assembler::notZero, test_mark_word);
2419   // slow path: use the klass prototype header
2420 push(rscratch1);
2421 load_prototype_header(temp_reg, oop, rscratch1);
2422 pop(rscratch1);
2423
2424 bind(test_mark_word);
2425 testl(temp_reg, test_bit);
2426 jcc((jmp_set) ? Assembler::notZero : Assembler::zero, jmp_label);
2427 }
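// test_oop_prototype_bit reads the bit from the object's mark word when the
// word is in its unlocked form; otherwise the mark word may be displaced, so
// the slow path reloads the same prototype bits from the klass prototype header.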
2428
2429 void MacroAssembler::test_flat_array_oop(Register oop, Register temp_reg,
2430 Label& is_flat_array) {
2431 #ifdef _LP64
2432 test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, true, is_flat_array);
2433 #else
2434 load_klass(temp_reg, oop, noreg);
2435 movl(temp_reg, Address(temp_reg, Klass::layout_helper_offset()));
2436 test_flat_array_layout(temp_reg, is_flat_array);
2437 #endif
2438 }
2439
2440 void MacroAssembler::test_non_flat_array_oop(Register oop, Register temp_reg,
2441 Label& is_non_flat_array) {
2442 #ifdef _LP64
2443 test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, false, is_non_flat_array);
2444 #else
2445 load_klass(temp_reg, oop, noreg);
2446 movl(temp_reg, Address(temp_reg, Klass::layout_helper_offset()));
2447 test_non_flat_array_layout(temp_reg, is_non_flat_array);
2448 #endif
2449 }
2450
2451 void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array) {
2452 #ifdef _LP64
2453 test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, true, is_null_free_array);
2454 #else
2455 Unimplemented();
2456 #endif
2457 }
2458
2459 void MacroAssembler::test_non_null_free_array_oop(Register oop, Register temp_reg, Label& is_non_null_free_array) {
2460 #ifdef _LP64
2461 test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, false, is_non_null_free_array);
2462 #else
2463 Unimplemented();
2464 #endif
2465 }
2466
2467 void MacroAssembler::test_flat_array_layout(Register lh, Label& is_flat_array) {
2468 testl(lh, Klass::_lh_array_tag_flat_value_bit_inplace);
2469 jcc(Assembler::notZero, is_flat_array);
2470 }
2471
2472 void MacroAssembler::test_non_flat_array_layout(Register lh, Label& is_non_flat_array) {
2473 testl(lh, Klass::_lh_array_tag_flat_value_bit_inplace);
2474 jcc(Assembler::zero, is_non_flat_array);
2475 }
2476
2477 void MacroAssembler::os_breakpoint() {
2478   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
2479 // (e.g., MSVC can't call ps() otherwise)
2480 call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2481 }
2482
2483 void MacroAssembler::unimplemented(const char* what) {
2484 const char* buf = nullptr;
2485 {
2486 ResourceMark rm;
2487 stringStream ss;
2488 ss.print("unimplemented: %s", what);
2489 buf = code_string(ss.as_string());
2490 }
2491 stop(buf);
2492 }
2493
2494 #define XSTATE_BV 0x200
2495
2496 void MacroAssembler::pop_CPU_state() {
2497 pop_FPU_state();
2498 pop_IU_state();
2499 }
2500
2501 void MacroAssembler::pop_FPU_state() {
2502 fxrstor(Address(rsp, 0));
2503 addptr(rsp, FPUStateSizeInWords * wordSize);
2504 }
2505
2506 void MacroAssembler::pop_IU_state() {
2507 popa();
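  // drop the 8-byte alignment padding added by push_IU_state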
2508 addq(rsp, 8);
2509 popf();
2510 }
2511
2512 // Save Integer and Float state
2513 // Warning: Stack must be 16 byte aligned (64bit)
2514 void MacroAssembler::push_CPU_state() {
2515 push_IU_state();
2516 push_FPU_state();
2517 }
2518
2519 void MacroAssembler::push_FPU_state() {
2520 subptr(rsp, FPUStateSizeInWords * wordSize);
2521 fxsave(Address(rsp, 0));
2522 }
2523
2524 void MacroAssembler::push_IU_state() {
2525 // Push flags first because pusha kills them
2526 pushf();
2527 // Make sure rsp stays 16-byte aligned
2528 subq(rsp, 8);
2529 pusha();
2530 }
2531
2532 void MacroAssembler::push_cont_fastpath() {
2533 if (!Continuations::enabled()) return;
2534
2535 Label L_done;
2536 cmpptr(rsp, Address(r15_thread, JavaThread::cont_fastpath_offset()));
2537 jccb(Assembler::belowEqual, L_done);
2538 movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rsp);
2539 bind(L_done);
2540 }
2541
2542 void MacroAssembler::pop_cont_fastpath() {
2543 if (!Continuations::enabled()) return;
2544
2545 Label L_done;
2546 cmpptr(rsp, Address(r15_thread, JavaThread::cont_fastpath_offset()));
2547 jccb(Assembler::below, L_done);
2548 movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
2549 bind(L_done);
2550 }
2551
2552 #ifdef ASSERT
2553 void MacroAssembler::stop_if_in_cont(Register cont, const char* name) {
2554 Label no_cont;
2555 movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset()));
2556 testl(cont, cont);
2557 jcc(Assembler::zero, no_cont);
2558 stop(name);
2559 bind(no_cont);
2560 }
2561 #endif
2562
2563 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
2564 // we must set sp to zero to clear frame
2565 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
2566 // must clear fp, so that compiled frames are not confused; it is
2567 // possible that we need it only for debugging
2568 if (clear_fp) {
2569 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2570 }
2571 // Always clear the pc because it could have been set by make_walkable()
2572 movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
2573 vzeroupper();
2574 }
2575
2576 void MacroAssembler::round_to(Register reg, int modulus) {
2577 addptr(reg, modulus - 1);
2578 andptr(reg, -modulus);
2579 }
2580
2581 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod) {
2582 if (at_return) {
2583 // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
2584 // we may safely use rsp instead to perform the stack watermark check.
2585 cmpptr(in_nmethod ? rsp : rbp, Address(r15_thread, JavaThread::polling_word_offset()));
2586 jcc(Assembler::above, slow_path);
2587 return;
2588 }
2589 testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2590 jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
2591 }
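// For the at_return case above, the emitted check is essentially (a sketch):
//   cmpq  rsp (or rbp), [r15 + polling_word_offset]
//   ja    slow_path
// i.e. a stack-watermark comparison; the non-return case instead tests the
// poll bit in the per-thread polling word.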
2592
2593 // Calls to C land
2594 //
2595 // When entering C land, the rbp and rsp of the last Java frame have to be recorded
2596 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
2597 // has to be reset to 0. This is required to allow proper stack traversal.
2598 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
2599 Register last_java_fp,
2600 address last_java_pc,
2601 Register rscratch) {
2602 vzeroupper();
2603 // determine last_java_sp register
2604 if (!last_java_sp->is_valid()) {
2605 last_java_sp = rsp;
2606 }
2607 // last_java_fp is optional
2608 if (last_java_fp->is_valid()) {
2609 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
2610 }
2611 // last_java_pc is optional
2612 if (last_java_pc != nullptr) {
2613 Address java_pc(r15_thread,
2614 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
2615 lea(java_pc, InternalAddress(last_java_pc), rscratch);
2616 }
2617 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
2618 }
2619
2620 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
2621 Register last_java_fp,
2622 Label &L,
2623 Register scratch) {
2624 lea(scratch, L);
2625 movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), scratch);
2626 set_last_Java_frame(last_java_sp, last_java_fp, nullptr, scratch);
2627 }
2628
2629 void MacroAssembler::shlptr(Register dst, int imm8) {
2630 shlq(dst, imm8);
2631 }
2632
2633 void MacroAssembler::shrptr(Register dst, int imm8) {
2634 shrq(dst, imm8);
2635 }
2636
2637 void MacroAssembler::sign_extend_byte(Register reg) {
2638 movsbl(reg, reg); // movsxb
2639 }
2640
2641 void MacroAssembler::sign_extend_short(Register reg) {
2642 movswl(reg, reg); // movsxw
2643 }
2644
2645 void MacroAssembler::testl(Address dst, int32_t imm32) {
2646 if (imm32 >= 0 && is8bit(imm32)) {
2647 testb(dst, imm32);
2648 } else {
2649 Assembler::testl(dst, imm32);
2650 }
2651 }
2652
2653 void MacroAssembler::testl(Register dst, int32_t imm32) {
2654 if (imm32 >= 0 && is8bit(imm32) && dst->has_byte_register()) {
2655 testb(dst, imm32);
2656 } else {
2657 Assembler::testl(dst, imm32);
2658 }
2659 }
2660
2661 void MacroAssembler::testl(Register dst, AddressLiteral src) {
2662 assert(always_reachable(src), "Address should be reachable");
2663 testl(dst, as_Address(src));
2664 }
2665
2666 void MacroAssembler::testq(Address dst, int32_t imm32) {
2667 if (imm32 >= 0) {
2668 testl(dst, imm32);
2669 } else {
2670 Assembler::testq(dst, imm32);
2671 }
2672 }
2673
2674 void MacroAssembler::testq(Register dst, int32_t imm32) {
2675 if (imm32 >= 0) {
2676 testl(dst, imm32);
2677 } else {
2678 Assembler::testq(dst, imm32);
2679 }
2680 }
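// The narrowing in testq above is safe: a non-negative imm32 zero-extends, so
// the upper 32 bits of the AND are zero and ZF/SF/PF (with CF/OF always cleared
// by test) come out the same as for the 32-bit form, which encodes shorter.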
2681
2682 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
2683 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2684 Assembler::pcmpeqb(dst, src);
2685 }
2686
2687 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
2688 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2689 Assembler::pcmpeqw(dst, src);
2690 }
2691
2692 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
2693 assert((dst->encoding() < 16),"XMM register should be 0-15");
2694 Assembler::pcmpestri(dst, src, imm8);
2695 }
2696
2697 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
2698 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2699 Assembler::pcmpestri(dst, src, imm8);
2700 }
2701
2702 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
2703 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2704 Assembler::pmovzxbw(dst, src);
2705 }
2706
2707 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
2708 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2709 Assembler::pmovzxbw(dst, src);
2710 }
2711
2712 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
2713 assert((src->encoding() < 16),"XMM register should be 0-15");
2714 Assembler::pmovmskb(dst, src);
2715 }
2716
2717 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
2718 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2719 Assembler::ptest(dst, src);
2720 }
2721
2722 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2723 assert(rscratch != noreg || always_reachable(src), "missing");
2724
2725 if (reachable(src)) {
2726 Assembler::sqrtss(dst, as_Address(src));
2727 } else {
2728 lea(rscratch, src);
2729 Assembler::sqrtss(dst, Address(rscratch, 0));
2730 }
2731 }
2732
2733 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2734 assert(rscratch != noreg || always_reachable(src), "missing");
2735
2736 if (reachable(src)) {
2737 Assembler::subsd(dst, as_Address(src));
2738 } else {
2739 lea(rscratch, src);
2740 Assembler::subsd(dst, Address(rscratch, 0));
2741 }
2742 }
2743
2744 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch) {
2745 assert(rscratch != noreg || always_reachable(src), "missing");
2746
2747 if (reachable(src)) {
2748 Assembler::roundsd(dst, as_Address(src), rmode);
2749 } else {
2750 lea(rscratch, src);
2751 Assembler::roundsd(dst, Address(rscratch, 0), rmode);
2752 }
2753 }
2754
2755 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2756 assert(rscratch != noreg || always_reachable(src), "missing");
2757
2758 if (reachable(src)) {
2759 Assembler::subss(dst, as_Address(src));
2760 } else {
2761 lea(rscratch, src);
2762 Assembler::subss(dst, Address(rscratch, 0));
2763 }
2764 }
2765
2766 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2767 assert(rscratch != noreg || always_reachable(src), "missing");
2768
2769 if (reachable(src)) {
2770 Assembler::ucomisd(dst, as_Address(src));
2771 } else {
2772 lea(rscratch, src);
2773 Assembler::ucomisd(dst, Address(rscratch, 0));
2774 }
2775 }
2776
2777 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2778 assert(rscratch != noreg || always_reachable(src), "missing");
2779
2780 if (reachable(src)) {
2781 Assembler::ucomiss(dst, as_Address(src));
2782 } else {
2783 lea(rscratch, src);
2784 Assembler::ucomiss(dst, Address(rscratch, 0));
2785 }
2786 }
2787
2788 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2789 assert(rscratch != noreg || always_reachable(src), "missing");
2790
2791 // Used in sign-bit flipping with aligned address.
2792 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
2793
2794 if (UseAVX > 2 &&
2795 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2796 (dst->encoding() >= 16)) {
2797 vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch);
2798 } else if (reachable(src)) {
2799 Assembler::xorpd(dst, as_Address(src));
2800 } else {
2801 lea(rscratch, src);
2802 Assembler::xorpd(dst, Address(rscratch, 0));
2803 }
2804 }
2805
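// xmm16-xmm31 cannot be encoded by the legacy SSE forms of xorpd/xorps; unless
// both AVX512DQ and AVX512VL are available, these xorpd/xorps helpers fall back
// to a 512-bit EVEX vpxor (AVX-512F), which can address the extended registers.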
2806 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
2807 if (UseAVX > 2 &&
2808 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2809 ((dst->encoding() >= 16) || (src->encoding() >= 16))) {
2810 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
2811 } else {
2812 Assembler::xorpd(dst, src);
2813 }
2814 }
2815
2816 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
2817 if (UseAVX > 2 &&
2818 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2819 ((dst->encoding() >= 16) || (src->encoding() >= 16))) {
2820 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
2821 } else {
2822 Assembler::xorps(dst, src);
2823 }
2824 }
2825
2826 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register rscratch) {
2827 assert(rscratch != noreg || always_reachable(src), "missing");
2828
2829 // Used in sign-bit flipping with aligned address.
2830 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
2831
2832 if (UseAVX > 2 &&
2833 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2834 (dst->encoding() >= 16)) {
2835 vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch);
2836 } else if (reachable(src)) {
2837 Assembler::xorps(dst, as_Address(src));
2838 } else {
2839 lea(rscratch, src);
2840 Assembler::xorps(dst, Address(rscratch, 0));
2841 }
2842 }
2843
2844 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src, Register rscratch) {
2845 assert(rscratch != noreg || always_reachable(src), "missing");
2846
2847   // The constant is expected to be 16-byte aligned; SSE (non-AVX) encodings require aligned memory operands.
2848 bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
2849 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
2850 if (reachable(src)) {
2851 Assembler::pshufb(dst, as_Address(src));
2852 } else {
2853 lea(rscratch, src);
2854 Assembler::pshufb(dst, Address(rscratch, 0));
2855 }
2856 }
2857
2858 // AVX 3-operands instructions
2859
2860 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
2861 assert(rscratch != noreg || always_reachable(src), "missing");
2862
2863 if (reachable(src)) {
2864 vaddsd(dst, nds, as_Address(src));
2865 } else {
2866 lea(rscratch, src);
2867 vaddsd(dst, nds, Address(rscratch, 0));
2868 }
2869 }
2870
2871 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
2872 assert(rscratch != noreg || always_reachable(src), "missing");
2873
2874 if (reachable(src)) {
2875 vaddss(dst, nds, as_Address(src));
2876 } else {
2877 lea(rscratch, src);
2878 vaddss(dst, nds, Address(rscratch, 0));
2879 }
2880 }
2881
2882 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2883 assert(UseAVX > 0, "requires some form of AVX");
2884 assert(rscratch != noreg || always_reachable(src), "missing");
2885
2886 if (reachable(src)) {
2887 Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
2888 } else {
2889 lea(rscratch, src);
2890 Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
2891 }
2892 }
2893
2894 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2895 assert(UseAVX > 0, "requires some form of AVX");
2896 assert(rscratch != noreg || always_reachable(src), "missing");
2897
2898 if (reachable(src)) {
2899 Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
2900 } else {
2901 lea(rscratch, src);
2902 Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
2903 }
2904 }
2905
2906 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
2907 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
2908 assert(rscratch != noreg || always_reachable(negate_field), "missing");
2909
2910 vandps(dst, nds, negate_field, vector_len, rscratch);
2911 }
2912
2913 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
2914 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
2915 assert(rscratch != noreg || always_reachable(negate_field), "missing");
2916
2917 vandpd(dst, nds, negate_field, vector_len, rscratch);
2918 }
2919
2920 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
2921 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2922 Assembler::vpaddb(dst, nds, src, vector_len);
2923 }
2924
2925 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
2926 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2927 Assembler::vpaddb(dst, nds, src, vector_len);
2928 }
2929
2930 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
2931 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2932 Assembler::vpaddw(dst, nds, src, vector_len);
2933 }
2934
2935 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
2936 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2937 Assembler::vpaddw(dst, nds, src, vector_len);
2938 }
2939
2940 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2941 assert(rscratch != noreg || always_reachable(src), "missing");
2942
2943 if (reachable(src)) {
2944 Assembler::vpand(dst, nds, as_Address(src), vector_len);
2945 } else {
2946 lea(rscratch, src);
2947 Assembler::vpand(dst, nds, Address(rscratch, 0), vector_len);
2948 }
2949 }
2950
2951 void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2952 assert(rscratch != noreg || always_reachable(src), "missing");
2953
2954 if (reachable(src)) {
2955 Assembler::vpbroadcastd(dst, as_Address(src), vector_len);
2956 } else {
2957 lea(rscratch, src);
2958 Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len);
2959 }
2960 }
2961
2962 void MacroAssembler::vbroadcasti128(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2963 assert(rscratch != noreg || always_reachable(src), "missing");
2964
2965 if (reachable(src)) {
2966 Assembler::vbroadcasti128(dst, as_Address(src), vector_len);
2967 } else {
2968 lea(rscratch, src);
2969 Assembler::vbroadcasti128(dst, Address(rscratch, 0), vector_len);
2970 }
2971 }
2972
2973 void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2974 assert(rscratch != noreg || always_reachable(src), "missing");
2975
2976 if (reachable(src)) {
2977 Assembler::vpbroadcastq(dst, as_Address(src), vector_len);
2978 } else {
2979 lea(rscratch, src);
2980 Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len);
2981 }
2982 }
2983
2984 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2985 assert(rscratch != noreg || always_reachable(src), "missing");
2986
2987 if (reachable(src)) {
2988 Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
2989 } else {
2990 lea(rscratch, src);
2991 Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
2992 }
2993 }
2994
2995 void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2996 assert(rscratch != noreg || always_reachable(src), "missing");
2997
2998 if (reachable(src)) {
2999 Assembler::vbroadcastss(dst, as_Address(src), vector_len);
3000 } else {
3001 lea(rscratch, src);
3002 Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len);
3003 }
3004 }
3005
3006 // Vector float blend
3007 // vblendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
3008 void MacroAssembler::vblendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
3009 // WARN: Allow dst == (src1|src2), mask == scratch
3010 bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1 &&
3011 !(VM_Version::is_intel_darkmont() && (dst == src1)); // partially fixed on Darkmont
3012 bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst;
3013 bool dst_available = dst != mask && (dst != src1 || dst != src2);
3014 if (blend_emulation && scratch_available && dst_available) {
3015 if (compute_mask) {
3016 vpsrad(scratch, mask, 32, vector_len);
3017 mask = scratch;
3018 }
3019 if (dst == src1) {
3020 vpandn(dst, mask, src1, vector_len); // if mask == 0, src1
3021 vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
3022 } else {
3023 vpand (dst, mask, src2, vector_len); // if mask == 1, src2
3024 vpandn(scratch, mask, src1, vector_len); // if mask == 0, src1
3025 }
3026 vpor(dst, dst, scratch, vector_len);
3027 } else {
3028 Assembler::vblendvps(dst, src1, src2, mask, vector_len);
3029 }
3030 }
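// In the emulated path above, vpsrad with a shift count >= 32 fills each 32-bit
// lane with its sign bit, turning the float sign bits into an all-ones/all-zeros
// mask; the blend is then (mask & src2) | (~mask & src1).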
3031
3032 // vblendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
3033 void MacroAssembler::vblendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
3034 // WARN: Allow dst == (src1|src2), mask == scratch
3035 bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1 &&
3036 !(VM_Version::is_intel_darkmont() && (dst == src1)); // partially fixed on Darkmont
3037 bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst && (!compute_mask || scratch != mask);
3038 bool dst_available = dst != mask && (dst != src1 || dst != src2);
3039 if (blend_emulation && scratch_available && dst_available) {
3040 if (compute_mask) {
3041 vpxor(scratch, scratch, scratch, vector_len);
3042 vpcmpgtq(scratch, scratch, mask, vector_len);
3043 mask = scratch;
3044 }
3045 if (dst == src1) {
3046       vpandn(dst, mask, src1, vector_len); // if mask == 0, src1
3047 vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
3048 } else {
3049 vpand (dst, mask, src2, vector_len); // if mask == 1, src2
3050       vpandn(scratch, mask, src1, vector_len); // if mask == 0, src1
3051 }
3052 vpor(dst, dst, scratch, vector_len);
3053 } else {
3054 Assembler::vblendvpd(dst, src1, src2, mask, vector_len);
3055 }
3056 }
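// The double-precision variant builds its lane mask differently: vpcmpgtq of
// zero against the mask yields all-ones exactly in the 64-bit lanes whose sign
// bit is set (0 > negative), matching vblendvpd's sign-select semantics.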
3057
3058 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3059 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3060 Assembler::vpcmpeqb(dst, nds, src, vector_len);
3061 }
3062
3063 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) {
3064 assert(((dst->encoding() < 16 && src1->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3065 Assembler::vpcmpeqb(dst, src1, src2, vector_len);
3066 }
3067
3068 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3069 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3070 Assembler::vpcmpeqw(dst, nds, src, vector_len);
3071 }
3072
3073 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3074 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3075 Assembler::vpcmpeqw(dst, nds, src, vector_len);
3076 }
3077
3078 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3079 assert(rscratch != noreg || always_reachable(src), "missing");
3080
3081 if (reachable(src)) {
3082 Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3083 } else {
3084 lea(rscratch, src);
3085 Assembler::evpcmpeqd(kdst, mask, nds, Address(rscratch, 0), vector_len);
3086 }
3087 }
3088
3089 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3090 int comparison, bool is_signed, int vector_len, Register rscratch) {
3091 assert(rscratch != noreg || always_reachable(src), "missing");
3092
3093 if (reachable(src)) {
3094 Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3095 } else {
3096 lea(rscratch, src);
3097 Assembler::evpcmpd(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3098 }
3099 }
3100
3101 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3102 int comparison, bool is_signed, int vector_len, Register rscratch) {
3103 assert(rscratch != noreg || always_reachable(src), "missing");
3104
3105 if (reachable(src)) {
3106 Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3107 } else {
3108 lea(rscratch, src);
3109 Assembler::evpcmpq(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3110 }
3111 }
3112
3113 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3114 int comparison, bool is_signed, int vector_len, Register rscratch) {
3115 assert(rscratch != noreg || always_reachable(src), "missing");
3116
3117 if (reachable(src)) {
3118 Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3119 } else {
3120 lea(rscratch, src);
3121 Assembler::evpcmpb(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3122 }
3123 }
3124
3125 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3126 int comparison, bool is_signed, int vector_len, Register rscratch) {
3127 assert(rscratch != noreg || always_reachable(src), "missing");
3128
3129 if (reachable(src)) {
3130 Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3131 } else {
3132 lea(rscratch, src);
3133 Assembler::evpcmpw(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3134 }
3135 }
3136
3137 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3138 if (width == Assembler::Q) {
3139 Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3140 } else {
3141 Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3142 }
3143 }
3144
3145 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
3146 int eq_cond_enc = 0x29;
3147 int gt_cond_enc = 0x37;
3148 if (width != Assembler::Q) {
3149 eq_cond_enc = 0x74 + width;
3150 gt_cond_enc = 0x64 + width;
3151 }
3152 switch (cond) {
3153 case eq:
3154 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3155 break;
3156 case neq:
3157 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3158 vallones(xtmp, vector_len);
3159 vpxor(dst, xtmp, dst, vector_len);
3160 break;
3161 case le:
3162 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3163 vallones(xtmp, vector_len);
3164 vpxor(dst, xtmp, dst, vector_len);
3165 break;
3166 case nlt:
3167 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3168 vallones(xtmp, vector_len);
3169 vpxor(dst, xtmp, dst, vector_len);
3170 break;
3171 case lt:
3172 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3173 break;
3174 case nle:
3175 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3176 break;
3177 default:
3178 assert(false, "Should not reach here");
3179 }
3180 }
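// vpcmpCCW synthesizes the predicates SSE/AVX lack from the two it provides
// (equal and greater-than): lt/nlt swap the operands, and neq/le/nlt invert the
// raw compare result by XOR-ing with an all-ones vector.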
3181
3182 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3183 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3184 Assembler::vpmovzxbw(dst, src, vector_len);
3185 }
3186
3187 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3188 assert((src->encoding() < 16),"XMM register should be 0-15");
3189 Assembler::vpmovmskb(dst, src, vector_len);
3190 }
3191
3192 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3193 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3194 Assembler::vpmullw(dst, nds, src, vector_len);
3195 }
3196
3197 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3198 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3199 Assembler::vpmullw(dst, nds, src, vector_len);
3200 }
3201
3202 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3203 assert((UseAVX > 0), "AVX support is needed");
3204 assert(rscratch != noreg || always_reachable(src), "missing");
3205
3206 if (reachable(src)) {
3207 Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3208 } else {
3209 lea(rscratch, src);
3210 Assembler::vpmulld(dst, nds, Address(rscratch, 0), vector_len);
3211 }
3212 }
3213
3214 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3215 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3216 Assembler::vpsubb(dst, nds, src, vector_len);
3217 }
3218
3219 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3220 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3221 Assembler::vpsubb(dst, nds, src, vector_len);
3222 }
3223
3224 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3225 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3226 Assembler::vpsubw(dst, nds, src, vector_len);
3227 }
3228
3229 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3230 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3231 Assembler::vpsubw(dst, nds, src, vector_len);
3232 }
3233
3234 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3235 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3236 Assembler::vpsraw(dst, nds, shift, vector_len);
3237 }
3238
3239 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3240 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3241 Assembler::vpsraw(dst, nds, shift, vector_len);
3242 }
3243
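// Without AVX512VL the EVEX-encoded vpsraq is only available at 512-bit vector
// length, so narrower requests are widened to AVX_512bit here.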
3244 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3245 assert(UseAVX > 2,"");
3246 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3247 vector_len = 2;
3248 }
3249 Assembler::evpsraq(dst, nds, shift, vector_len);
3250 }
3251
3252 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3253 assert(UseAVX > 2,"");
3254 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3255 vector_len = 2;
3256 }
3257 Assembler::evpsraq(dst, nds, shift, vector_len);
3258 }
3259
3260 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3261 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3262 Assembler::vpsrlw(dst, nds, shift, vector_len);
3263 }
3264
3265 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3266 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3267 Assembler::vpsrlw(dst, nds, shift, vector_len);
3268 }
3269
3270 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3271 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3272 Assembler::vpsllw(dst, nds, shift, vector_len);
3273 }
3274
3275 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3276 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3277 Assembler::vpsllw(dst, nds, shift, vector_len);
3278 }
3279
3280 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3281 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3282 Assembler::vptest(dst, src);
3283 }
3284
3285 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3286 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3287 Assembler::punpcklbw(dst, src);
3288 }
3289
3290 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3291 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3292 Assembler::pshufd(dst, src, mode);
3293 }
3294
3295 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3296 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3297 Assembler::pshuflw(dst, src, mode);
3298 }
3299
3300 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3301 assert(rscratch != noreg || always_reachable(src), "missing");
3302
3303 if (reachable(src)) {
3304 vandpd(dst, nds, as_Address(src), vector_len);
3305 } else {
3306 lea(rscratch, src);
3307 vandpd(dst, nds, Address(rscratch, 0), vector_len);
3308 }
3309 }
3310
3311 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3312 assert(rscratch != noreg || always_reachable(src), "missing");
3313
3314 if (reachable(src)) {
3315 vandps(dst, nds, as_Address(src), vector_len);
3316 } else {
3317 lea(rscratch, src);
3318 vandps(dst, nds, Address(rscratch, 0), vector_len);
3319 }
3320 }
3321
3322 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3323 bool merge, int vector_len, Register rscratch) {
3324 assert(rscratch != noreg || always_reachable(src), "missing");
3325
3326 if (reachable(src)) {
3327 Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3328 } else {
3329 lea(rscratch, src);
3330 Assembler::evpord(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
3331 }
3332 }
3333
3334 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3335 assert(rscratch != noreg || always_reachable(src), "missing");
3336
3337 if (reachable(src)) {
3338 vdivsd(dst, nds, as_Address(src));
3339 } else {
3340 lea(rscratch, src);
3341 vdivsd(dst, nds, Address(rscratch, 0));
3342 }
3343 }
3344
3345 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3346 assert(rscratch != noreg || always_reachable(src), "missing");
3347
3348 if (reachable(src)) {
3349 vdivss(dst, nds, as_Address(src));
3350 } else {
3351 lea(rscratch, src);
3352 vdivss(dst, nds, Address(rscratch, 0));
3353 }
3354 }
3355
3356 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3357 assert(rscratch != noreg || always_reachable(src), "missing");
3358
3359 if (reachable(src)) {
3360 vmulsd(dst, nds, as_Address(src));
3361 } else {
3362 lea(rscratch, src);
3363 vmulsd(dst, nds, Address(rscratch, 0));
3364 }
3365 }
3366
3367 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3368 assert(rscratch != noreg || always_reachable(src), "missing");
3369
3370 if (reachable(src)) {
3371 vmulss(dst, nds, as_Address(src));
3372 } else {
3373 lea(rscratch, src);
3374 vmulss(dst, nds, Address(rscratch, 0));
3375 }
3376 }
3377
3378 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3379 assert(rscratch != noreg || always_reachable(src), "missing");
3380
3381 if (reachable(src)) {
3382 vsubsd(dst, nds, as_Address(src));
3383 } else {
3384 lea(rscratch, src);
3385 vsubsd(dst, nds, Address(rscratch, 0));
3386 }
3387 }
3388
3389 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3390 assert(rscratch != noreg || always_reachable(src), "missing");
3391
3392 if (reachable(src)) {
3393 vsubss(dst, nds, as_Address(src));
3394 } else {
3395 lea(rscratch, src);
3396 vsubss(dst, nds, Address(rscratch, 0));
3397 }
3398 }
3399
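// Floating-point negation is implemented as an XOR with a sign-bit mask; src is
// expected to point at such a mask constant.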
3400 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3401 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3402 assert(rscratch != noreg || always_reachable(src), "missing");
3403
3404 vxorps(dst, nds, src, Assembler::AVX_128bit, rscratch);
3405 }
3406
3407 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3408 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3409 assert(rscratch != noreg || always_reachable(src), "missing");
3410
3411 vxorpd(dst, nds, src, Assembler::AVX_128bit, rscratch);
3412 }
3413
3414 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3415 assert(rscratch != noreg || always_reachable(src), "missing");
3416
3417 if (reachable(src)) {
3418 vxorpd(dst, nds, as_Address(src), vector_len);
3419 } else {
3420 lea(rscratch, src);
3421 vxorpd(dst, nds, Address(rscratch, 0), vector_len);
3422 }
3423 }
3424
3425 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3426 assert(rscratch != noreg || always_reachable(src), "missing");
3427
3428 if (reachable(src)) {
3429 vxorps(dst, nds, as_Address(src), vector_len);
3430 } else {
3431 lea(rscratch, src);
3432 vxorps(dst, nds, Address(rscratch, 0), vector_len);
3433 }
3434 }
3435
3436 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3437 assert(rscratch != noreg || always_reachable(src), "missing");
3438
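// vpxor on 256-bit vectors requires AVX2; on AVX-only hardware fall back to
// vxorpd, which produces the same bit pattern.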
3439 if (UseAVX > 1 || (vector_len < 1)) {
3440 if (reachable(src)) {
3441 Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3442 } else {
3443 lea(rscratch, src);
3444 Assembler::vpxor(dst, nds, Address(rscratch, 0), vector_len);
3445 }
3446 } else {
3447 MacroAssembler::vxorpd(dst, nds, src, vector_len, rscratch);
3448 }
3449 }
3450
3451 void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3452 assert(rscratch != noreg || always_reachable(src), "missing");
3453
3454 if (reachable(src)) {
3455 Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3456 } else {
3457 lea(rscratch, src);
3458 Assembler::vpermd(dst, nds, Address(rscratch, 0), vector_len);
3459 }
3460 }
3461
3462 void MacroAssembler::clear_jobject_tag(Register possibly_non_local) {
3463 const int32_t inverted_mask = ~static_cast<int32_t>(JNIHandles::tag_mask);
3464 STATIC_ASSERT(inverted_mask == -4); // otherwise check this code
3465 // The inverted mask is sign-extended
3466 andptr(possibly_non_local, inverted_mask);
3467 }
3468
3469 void MacroAssembler::resolve_jobject(Register value,
3470 Register tmp) {
3471 Register thread = r15_thread;
3472 assert_different_registers(value, thread, tmp);
3473 Label done, tagged, weak_tagged;
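// Dispatch on the low tag bits of the handle:
//   no tag          -> local handle, load the oop directly
//   global tag      -> strip the tag and load through the global handle
//   weak_global tag -> strip the tag and load as a phantom reference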
3474 testptr(value, value);
3475 jcc(Assembler::zero, done); // Use null as-is.
3476 testptr(value, JNIHandles::tag_mask); // Test for tag.
3477 jcc(Assembler::notZero, tagged);
3478
3479 // Resolve local handle
3480 access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp);
3481 verify_oop(value);
3482 jmp(done);
3483
3484 bind(tagged);
3485 testptr(value, JNIHandles::TypeTag::weak_global); // Test for weak tag.
3486 jcc(Assembler::notZero, weak_tagged);
3487
3488 // Resolve global handle
3489 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp);
3490 verify_oop(value);
3491 jmp(done);
3492
3493 bind(weak_tagged);
3494 // Resolve jweak.
3495 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3496 value, Address(value, -JNIHandles::TypeTag::weak_global), tmp);
3497 verify_oop(value);
3498
3499 bind(done);
3500 }
3501
3502 void MacroAssembler::resolve_global_jobject(Register value,
3503 Register tmp) {
3504 Register thread = r15_thread;
3505 assert_different_registers(value, thread, tmp);
3506 Label done;
3507
3508 testptr(value, value);
3509 jcc(Assembler::zero, done); // Use null as-is.
3510
3511 #ifdef ASSERT
3512 {
3513 Label valid_global_tag;
3514 testptr(value, JNIHandles::TypeTag::global); // Test for global tag.
3515 jcc(Assembler::notZero, valid_global_tag);
3516 stop("non global jobject using resolve_global_jobject");
3517 bind(valid_global_tag);
3518 }
3519 #endif
3520
3521 // Resolve global handle
3522 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp);
3523 verify_oop(value);
3524
3525 bind(done);
3526 }
3527
3528 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3529 subq(dst, imm32);
3530 }
3531
3532 // Force generation of a 4 byte immediate value even if it fits into 8bit
3533 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3534 subq_imm32(dst, imm32);
3535 }
3536
3537 void MacroAssembler::subptr(Register dst, Register src) {
3538 subq(dst, src);
3539 }
3540
3541 // C++ bool manipulation
3542 void MacroAssembler::testbool(Register dst) {
3543 if(sizeof(bool) == 1)
3544 testb(dst, 0xff);
3545 else if(sizeof(bool) == 2) {
3546 // testw implementation needed for two byte bools
3547 ShouldNotReachHere();
3548 } else if(sizeof(bool) == 4)
3549 testl(dst, dst);
3550 else
3551 // unsupported
3552 ShouldNotReachHere();
3553 }
3554
3555 void MacroAssembler::testptr(Register dst, Register src) {
3556 testq(dst, src);
3557 }
3558
3559 // Object / value buffer allocation...
3560 //
3561 // Kills klass and rsi on LP64
3562 void MacroAssembler::allocate_instance(Register klass, Register new_obj,
3563 Register t1, Register t2,
3564 bool clear_fields, Label& alloc_failed)
3565 {
3566 Label done, initialize_header, initialize_object, slow_case, slow_case_no_pop;
3567 Register layout_size = t1;
3568 assert(new_obj == rax, "needs to be rax");
3569 assert_different_registers(klass, new_obj, t1, t2);
3570
3571 // get instance_size in InstanceKlass (scaled to a count of bytes)
3572 movl(layout_size, Address(klass, Klass::layout_helper_offset()));
3573 // test to see if it is malformed in some way
3574 testl(layout_size, Klass::_lh_instance_slow_path_bit);
3575 jcc(Assembler::notZero, slow_case_no_pop);
3576
3577 // Allocate the instance:
3578 // If TLAB is enabled:
3579 // Try to allocate in the TLAB.
3580 // If fails, go to the slow path.
3581 // Else If inline contiguous allocations are enabled:
3582 // Try to allocate in eden.
3583 // If fails due to heap end, go to slow path.
3584 //
3585 // If TLAB is enabled OR inline contiguous is enabled:
3586 // Initialize the allocation.
3587 // Exit.
3588 //
3589 // Go to slow path.
3590
3591 push(klass);
3592 if (UseTLAB) {
3593 tlab_allocate(new_obj, layout_size, 0, klass, t2, slow_case);
3594 if (ZeroTLAB || (!clear_fields)) {
3595 // the fields have already been cleared
3596 jmp(initialize_header);
3597 } else {
3598 // initialize both the header and fields
3599 jmp(initialize_object);
3600 }
3601 } else {
3602 jmp(slow_case);
3603 }
3604
3605 // If UseTLAB is true, the object was allocated above and still needs to be initialized.
3606 // Otherwise, skip and go to the slow path.
3607 if (UseTLAB) {
3608 if (clear_fields) {
3609 // The object fields are initialized before the header. If the size excluding
3610 // the header is zero, go directly to the header initialization.
3611 bind(initialize_object);
3612 if (UseCompactObjectHeaders) {
3613 assert(is_aligned(oopDesc::base_offset_in_bytes(), BytesPerLong), "oop base offset must be 8-byte-aligned");
3614 decrement(layout_size, oopDesc::base_offset_in_bytes());
3615 } else {
3616 decrement(layout_size, sizeof(oopDesc));
3617 }
3618 jcc(Assembler::zero, initialize_header);
3619
3620 // Initialize topmost object field, divide size by 8, check if odd and
3621 // test if zero.
3622 Register zero = klass;
3623 xorl(zero, zero); // use zero reg to clear memory (shorter code)
3624 shrl(layout_size, LogBytesPerLong); // divide by 8 and set the carry flag if bit 2 was set
3625
3626 #ifdef ASSERT
3627 // make sure instance_size was multiple of 8
3628 Label L;
3629 // Ignore partial flag stall after shrl() since it is debug VM
3630 jcc(Assembler::carryClear, L);
3631 stop("object size is not a multiple of 8 - adjust this code");
3632 bind(L);
3633 // must be > 0, no extra check needed here
3634 #endif
3635
3636 // initialize remaining object fields: instance_size was a multiple of 8
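// Equivalent pseudocode (one 8-byte store per iteration):
//   do {
//     *((intptr_t*)(new_obj + header_size_bytes) + layout_size - 1) = 0;
//   } while (--layout_size != 0);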
3637 {
3638 Label loop;
3639 bind(loop);
3640 int header_size_bytes = oopDesc::header_size() * HeapWordSize;
3641 assert(is_aligned(header_size_bytes, BytesPerLong), "oop header size must be 8-byte-aligned");
3642 movptr(Address(new_obj, layout_size, Address::times_8, header_size_bytes - 1*oopSize), zero);
3643 decrement(layout_size);
3644 jcc(Assembler::notZero, loop);
3645 }
3646 } // clear_fields
3647
3648 // initialize object header only.
3649 bind(initialize_header);
3650 if (UseCompactObjectHeaders || EnableValhalla) {
3651 pop(klass);
3652 Register mark_word = t2;
3653 movptr(mark_word, Address(klass, Klass::prototype_header_offset()));
3654 movptr(Address(new_obj, oopDesc::mark_offset_in_bytes ()), mark_word);
3655 } else {
3656 movptr(Address(new_obj, oopDesc::mark_offset_in_bytes()),
3657 (intptr_t)markWord::prototype().value()); // header
3658 pop(klass); // get saved klass back in the register.
3659 }
3660 if (!UseCompactObjectHeaders) {
3661 xorl(rsi, rsi); // use zero reg to clear memory (shorter code)
3662 store_klass_gap(new_obj, rsi); // zero klass gap for compressed oops
3663 movptr(t2, klass); // preserve klass
3664 store_klass(new_obj, t2, rscratch1); // src klass reg is potentially compressed
3665 }
3666 jmp(done);
3667 }
3668
3669 bind(slow_case);
3670 pop(klass);
3671 bind(slow_case_no_pop);
3672 jmp(alloc_failed);
3673
3674 bind(done);
3675 }
3676
3677 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3678 void MacroAssembler::tlab_allocate(Register obj,
3679 Register var_size_in_bytes,
3680 int con_size_in_bytes,
3681 Register t1,
3682 Register t2,
3683 Label& slow_case) {
3684 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3685 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3686 }
3687
3688 RegSet MacroAssembler::call_clobbered_gp_registers() {
3689 RegSet regs;
3690 regs += RegSet::of(rax, rcx, rdx);
3691 #ifndef _WINDOWS
3692 regs += RegSet::of(rsi, rdi);
3693 #endif
3694 regs += RegSet::range(r8, r11);
3695 if (UseAPX) {
3696 regs += RegSet::range(r16, as_Register(Register::number_of_registers - 1));
3697 }
3698 return regs;
3699 }
3700
3701 XMMRegSet MacroAssembler::call_clobbered_xmm_registers() {
3702 int num_xmm_registers = XMMRegister::available_xmm_registers();
3703 #if defined(_WINDOWS)
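// The Windows x64 ABI treats xmm6-xmm15 as callee-saved, so only xmm0-xmm5
// (plus the EVEX registers xmm16 and above, when available) are call-clobbered.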
3704 XMMRegSet result = XMMRegSet::range(xmm0, xmm5);
3705 if (num_xmm_registers > 16) {
3706 result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1));
3707 }
3708 return result;
3709 #else
3710 return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1));
3711 #endif
3712 }
3713
3714 // C1 only ever uses the first double/float of the XMM register.
3715 static int xmm_save_size() { return sizeof(double); }
3716
3717 static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
3718 masm->movdbl(Address(rsp, offset), reg);
3719 }
3720
3721 static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
3722 masm->movdbl(reg, Address(rsp, offset));
3723 }
3724
3725 static int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers,
3726 bool save_fpu, int& gp_area_size, int& xmm_area_size) {
3727
3728 gp_area_size = align_up(gp_registers.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size,
3729 StackAlignmentInBytes);
3730 xmm_area_size = save_fpu ? xmm_registers.size() * xmm_save_size() : 0;
3731
3732 return gp_area_size + xmm_area_size;
3733 }
3734
3735 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) {
3736 block_comment("push_call_clobbered_registers start");
3737 // Regular registers
3738 RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude;
3739
3740 int gp_area_size;
3741 int xmm_area_size;
3742 int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu,
3743 gp_area_size, xmm_area_size);
3744 subptr(rsp, total_save_size);
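// Frame layout: GP registers are spilled at [rsp + 0, gp_area_size), and the
// XMM registers (if saved) immediately above, at [rsp + gp_area_size, ...).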
3745
3746 push_set(gp_registers_to_push, 0);
3747
3748 if (save_fpu) {
3749 push_set(call_clobbered_xmm_registers(), gp_area_size);
3750 }
3751
3752 block_comment("push_call_clobbered_registers end");
3753 }
3754
3755 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) {
3756 block_comment("pop_call_clobbered_registers start");
3757
3758 RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude;
3759
3760 int gp_area_size;
3761 int xmm_area_size;
3762 int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu,
3763 gp_area_size, xmm_area_size);
3764
3765 if (restore_fpu) {
3766 pop_set(call_clobbered_xmm_registers(), gp_area_size);
3767 }
3768
3769 pop_set(gp_registers_to_pop, 0);
3770
3771 addptr(rsp, total_save_size);
3772
3773 vzeroupper();
3774
3775 block_comment("pop_call_clobbered_registers end");
3776 }
3777
3778 void MacroAssembler::push_set(XMMRegSet set, int offset) {
3779 assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be");
3780 int spill_offset = offset;
3781
3782 for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) {
3783 save_xmm_register(this, spill_offset, *it);
3784 spill_offset += xmm_save_size();
3785 }
3786 }
3787
3788 void MacroAssembler::pop_set(XMMRegSet set, int offset) {
3789 int restore_size = set.size() * xmm_save_size();
3790 assert(is_aligned(restore_size, StackAlignmentInBytes), "must be");
3791
3792 int restore_offset = offset + restore_size - xmm_save_size();
3793
3794 for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) {
3795 restore_xmm_register(this, restore_offset, *it);
3796 restore_offset -= xmm_save_size();
3797 }
3798 }
3799
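// For the RegSet variants below, offset == -1 means push_set allocates (and the
// matching pop_set releases) the aligned stack area itself; otherwise the caller
// has already reserved the space and offset is the start of the spill area.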
3800 void MacroAssembler::push_set(RegSet set, int offset) {
3801 int spill_offset;
3802 if (offset == -1) {
3803 int register_push_size = set.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3804 int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
3805 subptr(rsp, aligned_size);
3806 spill_offset = 0;
3807 } else {
3808 spill_offset = offset;
3809 }
3810
3811 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
3812 movptr(Address(rsp, spill_offset), *it);
3813 spill_offset += Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3814 }
3815 }
3816
3817 void MacroAssembler::pop_set(RegSet set, int offset) {
3818
3819 int gp_reg_size = Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3820 int restore_size = set.size() * gp_reg_size;
3821 int aligned_size = align_up(restore_size, StackAlignmentInBytes);
3822
3823 int restore_offset;
3824 if (offset == -1) {
3825 restore_offset = restore_size - gp_reg_size;
3826 } else {
3827 restore_offset = offset + restore_size - gp_reg_size;
3828 }
3829 for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) {
3830 movptr(*it, Address(rsp, restore_offset));
3831 restore_offset -= gp_reg_size;
3832 }
3833
3834 if (offset == -1) {
3835 addptr(rsp, aligned_size);
3836 }
3837 }
3838
3839 // Preserves the contents of address, destroys the contents length_in_bytes and temp.
3840 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
3841 assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
3842 assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
3843 Label done;
3844
3845 testptr(length_in_bytes, length_in_bytes);
3846 jcc(Assembler::zero, done);
3847
3848 // convert the byte count into a word count; the loop below then clears the memory a word at a time
3849 // note: for the remaining code to work, index must be a multiple of BytesPerWord
3850 #ifdef ASSERT
3851 {
3852 Label L;
3853 testptr(length_in_bytes, BytesPerWord - 1);
3854 jcc(Assembler::zero, L);
3855 stop("length must be a multiple of BytesPerWord");
3856 bind(L);
3857 }
3858 #endif
3859 Register index = length_in_bytes;
3860 xorptr(temp, temp); // use _zero reg to clear memory (shorter code)
3861 if (UseIncDec) {
3862 shrptr(index, 3); // divide by 8 and set carry flag if bit 2 was set
3863 } else {
3864 shrptr(index, 2); // use 2 instructions to avoid partial flag stall
3865 shrptr(index, 1);
3866 }
3867
3868 // initialize the remaining memory, one word per iteration
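// Equivalent pseudocode (one word-sized store per iteration):
//   do {
//     *((intptr_t*)(address + offset_in_bytes) + index - 1) = 0;
//   } while (--index != 0);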
3869 {
3870 Label loop;
3871 bind(loop);
3872 movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
3873 decrement(index);
3874 jcc(Assembler::notZero, loop);
3875 }
3876
3877 bind(done);
3878 }
3879
3880 void MacroAssembler::get_inline_type_field_klass(Register holder_klass, Register index, Register inline_klass) {
3881 inline_layout_info(holder_klass, index, inline_klass);
3882 movptr(inline_klass, Address(inline_klass, InlineLayoutInfo::klass_offset()));
3883 }
3884
3885 void MacroAssembler::inline_layout_info(Register holder_klass, Register index, Register layout_info) {
3886 movptr(layout_info, Address(holder_klass, InstanceKlass::inline_layout_info_array_offset()));
3887 #ifdef ASSERT
3888 {
3889 Label done;
3890 cmpptr(layout_info, 0);
3891 jcc(Assembler::notEqual, done);
3892 stop("inline_layout_info_array is null");
3893 bind(done);
3894 }
3895 #endif
3896
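// Compute the address of entry 'index' in the InlineLayoutInfo array: scale
// index by the element size in place (this clobbers the caller's index
// register) and add the array's data offset.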
3897 InlineLayoutInfo array[2];
3898 int size = (char*)&array[1] - (char*)&array[0]; // computing size of array elements
3899 if (is_power_of_2(size)) {
3900 shll(index, log2i_exact(size)); // Scale index by power of 2
3901 } else {
3902 imull(index, index, size); // Scale the index to be the entry index * array_element_size
3903 }
3904 lea(layout_info, Address(layout_info, index, Address::times_1, Array<InlineLayoutInfo>::base_offset_in_bytes()));
3905 }
3906
3907 // Look up the method for a megamorphic invokeinterface call.
3908 // The target method is determined by <intf_klass, itable_index>.
3909 // The receiver klass is in recv_klass.
3910 // On success, the result will be in method_result, and execution falls through.
3911 // On failure, execution transfers to the given label.
3912 void MacroAssembler::lookup_interface_method(Register recv_klass,
3913 Register intf_klass,
3914 RegisterOrConstant itable_index,
3915 Register method_result,
3916 Register scan_temp,
3917 Label& L_no_such_interface,
3918 bool return_method) {
3919 assert_different_registers(recv_klass, intf_klass, scan_temp);
3920 assert_different_registers(method_result, intf_klass, scan_temp);
3921 assert(recv_klass != method_result || !return_method,
3922 "recv_klass can be destroyed when method isn't needed");
3923
3924 assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3925 "caller must use same register for non-constant itable index as for method");
3926
3927 // Compute start of first itableOffsetEntry (which is at the end of the vtable)
3928 int vtable_base = in_bytes(Klass::vtable_start_offset());
3929 int itentry_off = in_bytes(itableMethodEntry::method_offset());
3930 int scan_step = itableOffsetEntry::size() * wordSize;
3931 int vte_size = vtableEntry::size_in_bytes();
3932 Address::ScaleFactor times_vte_scale = Address::times_ptr;
3933 assert(vte_size == wordSize, "else adjust times_vte_scale");
3934
3935 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3936
3937 // Could store the aligned, prescaled offset in the klass.
3938 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
3939
3940 if (return_method) {
3941 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3942 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3943 lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
3944 }
3945
3946 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
3947 // if (scan->interface() == intf) {
3948 // result = (klass + scan->offset() + itable_index);
3949 // }
3950 // }
3951 Label search, found_method;
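// The scan loop is peeled once: the first itable entry is tested before the
// loop proper, and inside the loop the test is inverted so that a hit falls
// through to found_method.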
3952
3953 for (int peel = 1; peel >= 0; peel--) {
3954 movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset()));
3955 cmpptr(intf_klass, method_result);
3956
3957 if (peel) {
3958 jccb(Assembler::equal, found_method);
3959 } else {
3960 jccb(Assembler::notEqual, search);
3961 // (invert the test to fall through to found_method...)
3962 }
3963
3964 if (!peel) break;
3965
3966 bind(search);
3967
3968 // Check that the previous entry is non-null. A null entry means that
3969 // the receiver class doesn't implement the interface, and wasn't the
3970 // same as when the caller was compiled.
3971 testptr(method_result, method_result);
3972 jcc(Assembler::zero, L_no_such_interface);
3973 addptr(scan_temp, scan_step);
3974 }
3975
3976 bind(found_method);
3977
3978 if (return_method) {
3979 // Got a hit.
3980 movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset()));
3981 movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
3982 }
3983 }
3984
3985 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
3986 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
3987 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
3988 // The target method is determined by <holder_klass, itable_index>.
3989 // The receiver klass is in recv_klass.
3990 // On success, the result will be in method_result, and execution falls through.
3991 // On failure, execution transfers to the given label.
3992 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
3993 Register holder_klass,
3994 Register resolved_klass,
3995 Register method_result,
3996 Register scan_temp,
3997 Register temp_reg2,
3998 Register receiver,
3999 int itable_index,
4000 Label& L_no_such_interface) {
4001 assert_different_registers(recv_klass, method_result, holder_klass, resolved_klass, scan_temp, temp_reg2, receiver);
4002 Register temp_itbl_klass = method_result;
4003 Register temp_reg = (temp_reg2 == noreg ? recv_klass : temp_reg2); // reuse recv_klass register on 32-bit x86 impl
4004
4005 int vtable_base = in_bytes(Klass::vtable_start_offset());
4006 int itentry_off = in_bytes(itableMethodEntry::method_offset());
4007 int scan_step = itableOffsetEntry::size() * wordSize;
4008 int vte_size = vtableEntry::size_in_bytes();
4009 int ioffset = in_bytes(itableOffsetEntry::interface_offset());
4010 int ooffset = in_bytes(itableOffsetEntry::offset_offset());
4011 Address::ScaleFactor times_vte_scale = Address::times_ptr;
4012 assert(vte_size == wordSize, "adjust times_vte_scale");
4013
4014 Label L_loop_scan_resolved_entry, L_resolved_found, L_holder_found;
4015
4016 // temp_itbl_klass = recv_klass.itable[0]
4017 // scan_temp = &recv_klass.itable[0] + step
4018 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4019 movptr(temp_itbl_klass, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset));
4020 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset + scan_step));
4021 xorptr(temp_reg, temp_reg);
4022
4023 // Initial checks:
4024 // - if (holder_klass != resolved_klass), go to "scan for resolved"
4025 // - if (itable[0] == 0), no such interface
4026 // - if (itable[0] == holder_klass), shortcut to "holder found"
4027 cmpptr(holder_klass, resolved_klass);
4028 jccb(Assembler::notEqual, L_loop_scan_resolved_entry);
4029 testptr(temp_itbl_klass, temp_itbl_klass);
4030 jccb(Assembler::zero, L_no_such_interface);
4031 cmpptr(holder_klass, temp_itbl_klass);
4032 jccb(Assembler::equal, L_holder_found);
4033
4034 // Loop: Look for holder_klass record in itable
4035 // do {
4036 // tmp = itable[index];
4037 // index += step;
4038 // if (tmp == holder_klass) {
4039 // goto L_holder_found; // Found!
4040 // }
4041 // } while (tmp != 0);
4042 // goto L_no_such_interface // Not found.
4043 Label L_scan_holder;
4044 bind(L_scan_holder);
4045 movptr(temp_itbl_klass, Address(scan_temp, 0));
4046 addptr(scan_temp, scan_step);
4047 cmpptr(holder_klass, temp_itbl_klass);
4048 jccb(Assembler::equal, L_holder_found);
4049 testptr(temp_itbl_klass, temp_itbl_klass);
4050 jccb(Assembler::notZero, L_scan_holder);
4051
4052 jmpb(L_no_such_interface);
4053
4054 // Loop: Look for resolved_class record in itable
4055 // do {
4056 // tmp = itable[index];
4057 // index += step;
4058 // if (tmp == holder_klass) {
4059 // // Also check if we have met a holder klass
4060 // holder_tmp = itable[index-step-ioffset];
4061 // }
4062 // if (tmp == resolved_klass) {
4063 // goto L_resolved_found; // Found!
4064 // }
4065 // } while (tmp != 0);
4066 // goto L_no_such_interface // Not found.
4067 //
4068 Label L_loop_scan_resolved;
4069 bind(L_loop_scan_resolved);
4070 movptr(temp_itbl_klass, Address(scan_temp, 0));
4071 addptr(scan_temp, scan_step);
4072 bind(L_loop_scan_resolved_entry);
4073 cmpptr(holder_klass, temp_itbl_klass);
4074 cmovl(Assembler::equal, temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
4075 cmpptr(resolved_klass, temp_itbl_klass);
4076 jccb(Assembler::equal, L_resolved_found);
4077 testptr(temp_itbl_klass, temp_itbl_klass);
4078 jccb(Assembler::notZero, L_loop_scan_resolved);
4079
4080 jmpb(L_no_such_interface);
4081
4082 Label L_ready;
4083
4084 // See if we already have a holder klass. If not, go and scan for it.
4085 bind(L_resolved_found);
4086 testptr(temp_reg, temp_reg);
4087 jccb(Assembler::zero, L_scan_holder);
4088 jmpb(L_ready);
4089
4090 bind(L_holder_found);
4091 movl(temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
4092
4093 // Finally, temp_reg contains holder_klass vtable offset
4094 bind(L_ready);
4095 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4096 if (temp_reg2 == noreg) { // recv_klass register is clobbered for 32-bit x86 impl
4097 load_klass(scan_temp, receiver, noreg);
4098 movptr(method_result, Address(scan_temp, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
4099 } else {
4100 movptr(method_result, Address(recv_klass, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
4101 }
4102 }
4103
4104
4105 // virtual method calling
4106 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4107 RegisterOrConstant vtable_index,
4108 Register method_result) {
4109 const ByteSize base = Klass::vtable_start_offset();
4110 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4111 Address vtable_entry_addr(recv_klass,
4112 vtable_index, Address::times_ptr,
4113 base + vtableEntry::method_offset());
4114 movptr(method_result, vtable_entry_addr);
4115 }
4116
4117
4118 void MacroAssembler::check_klass_subtype(Register sub_klass,
4119 Register super_klass,
4120 Register temp_reg,
4121 Label& L_success) {
4122 Label L_failure;
4123 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, nullptr);
4124 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, nullptr);
4125 bind(L_failure);
4126 }
4127
4128
4129 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4130 Register super_klass,
4131 Register temp_reg,
4132 Label* L_success,
4133 Label* L_failure,
4134 Label* L_slow_path,
4135 RegisterOrConstant super_check_offset) {
4136 assert_different_registers(sub_klass, super_klass, temp_reg);
4137 bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
4138 if (super_check_offset.is_register()) {
4139 assert_different_registers(sub_klass, super_klass,
4140 super_check_offset.as_register());
4141 } else if (must_load_sco) {
4142 assert(temp_reg != noreg, "supply either a temp or a register offset");
4143 }
4144
4145 Label L_fallthrough;
4146 int label_nulls = 0;
4147 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4148 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4149 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
4150 assert(label_nulls <= 1, "at most one null in the batch");
4151
4152 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4153 int sco_offset = in_bytes(Klass::super_check_offset_offset());
4154 Address super_check_offset_addr(super_klass, sco_offset);
4155
4156 // Hacked jcc, which "knows" that L_fallthrough, at least, is in
4157 // range of a jccb. If this routine grows larger, reconsider at
4158 // least some of these.
4159 #define local_jcc(assembler_cond, label) \
4160 if (&(label) == &L_fallthrough) jccb(assembler_cond, label); \
4161 else jcc( assembler_cond, label) /*omit semi*/
4162
4163 // Hacked jmp, which may only be used just before L_fallthrough.
4164 #define final_jmp(label) \
4165 if (&(label) == &L_fallthrough) { /*do nothing*/ } \
4166 else jmp(label) /*omit semi*/
4167
4168 // If the pointers are equal, we are done (e.g., String[] elements).
4169 // This self-check enables sharing of secondary supertype arrays among
4170 // non-primary types such as array-of-interface. Otherwise, each such
4171 // type would need its own customized SSA.
4172 // We move this check to the front of the fast path because many
4173 // type checks are in fact trivially successful in this manner,
4174 // so we get a nicely predicted branch right at the start of the check.
4175 cmpptr(sub_klass, super_klass);
4176 local_jcc(Assembler::equal, *L_success);
4177
4178 // Check the supertype display:
4179 if (must_load_sco) {
4180 // Positive movl does right thing on LP64.
4181 movl(temp_reg, super_check_offset_addr);
4182 super_check_offset = RegisterOrConstant(temp_reg);
4183 }
4184 Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4185 cmpptr(super_klass, super_check_addr); // load displayed supertype
4186
4187 // This check has worked decisively for primary supers.
4188 // Secondary supers are sought in the super_cache ('super_cache_addr').
4189 // (Secondary supers are interfaces and very deeply nested subtypes.)
4190 // This works in the same check above because of a tricky aliasing
4191 // between the super_cache and the primary super display elements.
4192 // (The 'super_check_addr' can address either, as the case requires.)
4193 // Note that the cache is updated below if it does not help us find
4194 // what we need immediately.
4195 // So if it was a primary super, we can just fail immediately.
4196 // Otherwise, it's the slow path for us (no success at this point).
4197
4198 if (super_check_offset.is_register()) {
4199 local_jcc(Assembler::equal, *L_success);
4200 cmpl(super_check_offset.as_register(), sc_offset);
4201 if (L_failure == &L_fallthrough) {
4202 local_jcc(Assembler::equal, *L_slow_path);
4203 } else {
4204 local_jcc(Assembler::notEqual, *L_failure);
4205 final_jmp(*L_slow_path);
4206 }
4207 } else if (super_check_offset.as_constant() == sc_offset) {
4208 // Need a slow path; fast failure is impossible.
4209 if (L_slow_path == &L_fallthrough) {
4210 local_jcc(Assembler::equal, *L_success);
4211 } else {
4212 local_jcc(Assembler::notEqual, *L_slow_path);
4213 final_jmp(*L_success);
4214 }
4215 } else {
4216 // No slow path; it's a fast decision.
4217 if (L_failure == &L_fallthrough) {
4218 local_jcc(Assembler::equal, *L_success);
4219 } else {
4220 local_jcc(Assembler::notEqual, *L_failure);
4221 final_jmp(*L_success);
4222 }
4223 }
4224
4225 bind(L_fallthrough);
4226
4227 #undef local_jcc
4228 #undef final_jmp
4229 }
4230
4231
4232 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
4233 Register super_klass,
4234 Register temp_reg,
4235 Register temp2_reg,
4236 Label* L_success,
4237 Label* L_failure,
4238 bool set_cond_codes) {
4239 assert_different_registers(sub_klass, super_klass, temp_reg);
4240 if (temp2_reg != noreg)
4241 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4242 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4243
4244 Label L_fallthrough;
4245 int label_nulls = 0;
4246 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4247 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4248 assert(label_nulls <= 1, "at most one null in the batch");
4249
4250 // a couple of useful fields in sub_klass:
4251 int ss_offset = in_bytes(Klass::secondary_supers_offset());
4252 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4253 Address secondary_supers_addr(sub_klass, ss_offset);
4254 Address super_cache_addr( sub_klass, sc_offset);
4255
4256 // Do a linear scan of the secondary super-klass chain.
4257 // This code is rarely used, so simplicity is a virtue here.
4258 // The repne_scan instruction uses fixed registers, which we must spill.
4259 // Don't worry too much about pre-existing connections with the input regs.
4260
4261 assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4262 assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4263
4264 // Get super_klass value into rax (even if it was in rdi or rcx).
4265 bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4266 if (super_klass != rax) {
4267 if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4268 mov(rax, super_klass);
4269 }
4270 if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4271 if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4272
4273 #ifndef PRODUCT
4274 uint* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4275 ExternalAddress pst_counter_addr((address) pst_counter);
4276 lea(rcx, pst_counter_addr);
4277 incrementl(Address(rcx, 0));
4278 #endif //PRODUCT
4279
4280 // We will consult the secondary-super array.
4281 movptr(rdi, secondary_supers_addr);
4282 // Load the array length. (Positive movl does right thing on LP64.)
4283 movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4284 // Skip to start of data.
4285 addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4286
4287 // Scan RCX words at [RDI] for an occurrence of RAX.
4288 // Set NZ/Z based on last compare.
4289 // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
4290 // not change flags (only scas instruction which is repeated sets flags).
4291 // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
4292
4293 testptr(rax,rax); // Set Z = 0
4294 repne_scan();
4295
4296 // Unspill the temp. registers:
4297 if (pushed_rdi) pop(rdi);
4298 if (pushed_rcx) pop(rcx);
4299 if (pushed_rax) pop(rax);
4300
4301 if (set_cond_codes) {
4302 // Special hack for the AD files: rdi is guaranteed non-zero.
4303 assert(!pushed_rdi, "rdi must be left non-null");
4304 // Also, the condition codes are properly set Z/NZ on succeed/failure.
4305 }
4306
4307 if (L_failure == &L_fallthrough)
4308 jccb(Assembler::notEqual, *L_failure);
4309 else jcc(Assembler::notEqual, *L_failure);
4310
4311 // Success. Cache the super we found and proceed in triumph.
4312 movptr(super_cache_addr, super_klass);
4313
4314 if (L_success != &L_fallthrough) {
4315 jmp(*L_success);
4316 }
4317
4318 #undef IS_A_TEMP
4319
4320 bind(L_fallthrough);
4321 }
4322
4323 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4324 Register super_klass,
4325 Register temp_reg,
4326 Register temp2_reg,
4327 Label* L_success,
4328 Label* L_failure,
4329 bool set_cond_codes) {
4330 assert(set_cond_codes == false, "must be false on 64-bit x86");
4331 check_klass_subtype_slow_path
4332 (sub_klass, super_klass, temp_reg, temp2_reg, noreg, noreg,
4333 L_success, L_failure);
4334 }
4335
4336 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4337 Register super_klass,
4338 Register temp_reg,
4339 Register temp2_reg,
4340 Register temp3_reg,
4341 Register temp4_reg,
4342 Label* L_success,
4343 Label* L_failure) {
4344 if (UseSecondarySupersTable) {
4345 check_klass_subtype_slow_path_table
4346 (sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, temp4_reg,
4347 L_success, L_failure);
4348 } else {
4349 check_klass_subtype_slow_path_linear
4350 (sub_klass, super_klass, temp_reg, temp2_reg, L_success, L_failure, /*set_cond_codes*/false);
4351 }
4352 }
4353
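// If r is noreg, take the next free register from available_regs and record it in
// regs_to_push so that the caller saves and restores it around the table lookup.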
4354 Register MacroAssembler::allocate_if_noreg(Register r,
4355 RegSetIterator<Register> &available_regs,
4356 RegSet ®s_to_push) {
4357 if (!r->is_valid()) {
4358 r = *available_regs++;
4359 regs_to_push += r;
4360 }
4361 return r;
4362 }
4363
4364 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
4365 Register super_klass,
4366 Register temp_reg,
4367 Register temp2_reg,
4368 Register temp3_reg,
4369 Register result_reg,
4370 Label* L_success,
4371 Label* L_failure) {
4372 // NB! Callers may assume that, when temp2_reg is a valid register,
4373 // this code sets it to a nonzero value.
4374 bool temp2_reg_was_valid = temp2_reg->is_valid();
4375
4376 RegSet temps = RegSet::of(temp_reg, temp2_reg, temp3_reg);
4377
4378 Label L_fallthrough;
4379 int label_nulls = 0;
4380 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4381 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4382 assert(label_nulls <= 1, "at most one null in the batch");
4383
4384 BLOCK_COMMENT("check_klass_subtype_slow_path_table");
4385
4386 RegSetIterator<Register> available_regs
4387 = (RegSet::of(rax, rcx, rdx, r8) + r9 + r10 + r11 + r12 - temps - sub_klass - super_klass).begin();
4388
4389 RegSet pushed_regs;
4390
4391 temp_reg = allocate_if_noreg(temp_reg, available_regs, pushed_regs);
4392 temp2_reg = allocate_if_noreg(temp2_reg, available_regs, pushed_regs);
4393 temp3_reg = allocate_if_noreg(temp3_reg, available_regs, pushed_regs);
4394 result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
4395 Register temp4_reg = allocate_if_noreg(noreg, available_regs, pushed_regs);
4396
4397 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, result_reg);
4398
4399 {
4400
4401 int register_push_size = pushed_regs.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
4402 int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
4403 subptr(rsp, aligned_size);
4404 push_set(pushed_regs, 0);
4405
4406 lookup_secondary_supers_table_var(sub_klass,
4407 super_klass,
4408 temp_reg, temp2_reg, temp3_reg, temp4_reg, result_reg);
4409 cmpq(result_reg, 0);
4410
4411 // Unspill the temp. registers:
4412 pop_set(pushed_regs, 0);
4413 // Increment SP but do not clobber flags.
4414 lea(rsp, Address(rsp, aligned_size));
4415 }
4416
4417 if (temp2_reg_was_valid) {
4418 movq(temp2_reg, 1);
4419 }
4420
4421 jcc(Assembler::notEqual, *L_failure);
4422
4423 if (L_success != &L_fallthrough) {
4424 jmp(*L_success);
4425 }
4426
4427 bind(L_fallthrough);
4428 }
4429
4430 // population_count variant for running without the POPCNT
4431 // instruction, which was introduced with SSE4.2 in 2008.
4432 void MacroAssembler::population_count(Register dst, Register src,
4433 Register scratch1, Register scratch2) {
4434 assert_different_registers(src, scratch1, scratch2);
4435 if (UsePopCountInstruction) {
4436 Assembler::popcntq(dst, src);
4437 } else {
4438 assert_different_registers(src, scratch1, scratch2);
4439 assert_different_registers(dst, scratch1, scratch2);
4440 Label loop, done;
4441
4442 mov(scratch1, src);
4443 // dst = 0;
4444 // while(scratch1 != 0) {
4445 // dst++;
4446 // scratch1 &= (scratch1 - 1);
4447 // }
4448 xorl(dst, dst);
4449 testq(scratch1, scratch1);
4450 jccb(Assembler::equal, done);
4451 {
4452 bind(loop);
4453 incq(dst);
4454 movq(scratch2, scratch1);
4455 decq(scratch2);
4456 andq(scratch1, scratch2);
4457 jccb(Assembler::notEqual, loop);
4458 }
4459 bind(done);
4460 }
4461 #ifdef ASSERT
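// Poison the scratch registers so callers cannot rely on their contents.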
4462 mov64(scratch1, 0xCafeBabeDeadBeef);
4463 movq(scratch2, scratch1);
4464 #endif
4465 }
4466
4467 // Ensure that the inline code and the stub are using the same registers.
4468 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \
4469 do { \
4470 assert(r_super_klass == rax, "mismatch"); \
4471 assert(r_array_base == rbx, "mismatch"); \
4472 assert(r_array_length == rcx, "mismatch"); \
4473 assert(r_array_index == rdx, "mismatch"); \
4474 assert(r_sub_klass == rsi || r_sub_klass == noreg, "mismatch"); \
4475 assert(r_bitmap == r11 || r_bitmap == noreg, "mismatch"); \
4476 assert(result == rdi || result == noreg, "mismatch"); \
4477 } while(0)
4478
4479 // Versions of salq and rorq that don't need count to be in rcx
4480
4481 void MacroAssembler::salq(Register dest, Register count) {
4482 if (count == rcx) {
4483 Assembler::salq(dest);
4484 } else {
4485 assert_different_registers(rcx, dest);
4486 xchgq(rcx, count);
4487 Assembler::salq(dest);
4488 xchgq(rcx, count);
4489 }
4490 }
4491
4492 void MacroAssembler::rorq(Register dest, Register count) {
4493 if (count == rcx) {
4494 Assembler::rorq(dest);
4495 } else {
4496 assert_different_registers(rcx, dest);
4497 xchgq(rcx, count);
4498 Assembler::rorq(dest);
4499 xchgq(rcx, count);
4500 }
4501 }
4502
4503 // Return true: we succeeded in generating this code
4504 //
4505 // At runtime, return 0 in result if r_super_klass is a superclass of
4506 // r_sub_klass, otherwise return nonzero. Use this if you know the
4507 // super_klass_slot of the class you're looking for. This is always
4508 // the case for instanceof and checkcast.
4509 void MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
4510 Register r_super_klass,
4511 Register temp1,
4512 Register temp2,
4513 Register temp3,
4514 Register temp4,
4515 Register result,
4516 u1 super_klass_slot) {
4517 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
4518
4519 Label L_fallthrough, L_success, L_failure;
4520
4521 BLOCK_COMMENT("lookup_secondary_supers_table {");
4522
4523 const Register
4524 r_array_index = temp1,
4525 r_array_length = temp2,
4526 r_array_base = temp3,
4527 r_bitmap = temp4;
4528
4529 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;
4530
4531 xorq(result, result); // = 0
4532
4533 movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4534 movq(r_array_index, r_bitmap);
4535
4536 // First check the bitmap to see if super_klass might be present. If
4537 // the bit is zero, we are certain that super_klass is not one of
4538 // the secondary supers.
4539 u1 bit = super_klass_slot;
4540 {
4541 // NB: If the count in an x86 shift instruction is 0, the flags are
4542 // not affected, so we do a testq instead.
4543 int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
4544 if (shift_count != 0) {
4545 salq(r_array_index, shift_count);
4546 } else {
4547 testq(r_array_index, r_array_index);
4548 }
4549 }
4550 // We test the MSB of r_array_index, i.e. its sign bit
4551 jcc(Assembler::positive, L_failure);
4552
4553 // Get the first array index that can contain super_klass into r_array_index.
4554 if (bit != 0) {
4555 population_count(r_array_index, r_array_index, temp2, temp3);
4556 } else {
4557 movl(r_array_index, 1);
4558 }
4559 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
4560
4561 // We will consult the secondary-super array.
4562 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4563
4564 // We're asserting that the first word in an Array<Klass*> is the
4565 // length, and the second word is the first word of the data. If
4566 // that ever changes, r_array_base will have to be adjusted here.
4567 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4568 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4569
4570 cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
4571 jccb(Assembler::equal, L_success);
4572
4573 // Is there another entry to check? Consult the bitmap.
4574 btq(r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
4575 jccb(Assembler::carryClear, L_failure);
4576
4577 // Linear probe. Rotate the bitmap so that the next bit to test is
4578 // in Bit 1.
4579 if (bit != 0) {
4580 rorq(r_bitmap, bit);
4581 }
4582
4583 // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
4584 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
4585 // Kills: r_array_length.
4586 // Returns: result.
4587 call(RuntimeAddress(StubRoutines::lookup_secondary_supers_table_slow_path_stub()));
4588 // Result (0/1) is in rdi
4589 jmpb(L_fallthrough);
4590
4591 bind(L_failure);
4592 incq(result); // 0 => 1
4593
4594 bind(L_success);
4595 // result = 0;
4596
4597 bind(L_fallthrough);
4598 BLOCK_COMMENT("} lookup_secondary_supers_table");
4599
4600 if (VerifySecondarySupers) {
4601 verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
4602 temp1, temp2, temp3);
4603 }
4604 }
4605
4606 // At runtime, return 0 in result if r_super_klass is a superclass of
4607 // r_sub_klass, otherwise return nonzero. Use this version of
4608 // lookup_secondary_supers_table() if you don't know ahead of time
4609 // which superclass will be searched for. Used by interpreter and
4610 // runtime stubs. It is larger and has somewhat greater latency than
4611 // the version above, which takes a constant super_klass_slot.
4612 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
4613 Register r_super_klass,
4614 Register temp1,
4615 Register temp2,
4616 Register temp3,
4617 Register temp4,
4618 Register result) {
4619 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
4620 assert_different_registers(r_sub_klass, r_super_klass, rcx);
4621 RegSet temps = RegSet::of(temp1, temp2, temp3, temp4);
4622
4623 Label L_fallthrough, L_success, L_failure;
4624
4625 BLOCK_COMMENT("lookup_secondary_supers_table {");
4626
4627 RegSetIterator<Register> available_regs = (temps - rcx).begin();
4628
4629 // FIXME. Once we are sure that all paths reaching this point really
4630 // do pass rcx as one of our temps we can get rid of the following
4631 // workaround.
4632 assert(temps.contains(rcx), "fix this code");
4633
4634 // We prefer to have our shift count in rcx. If rcx is one of our
4635 // temps, use it for slot. If not, pick any of our temps.
4636 Register slot;
4637 if (!temps.contains(rcx)) {
4638 slot = *available_regs++;
4639 } else {
4640 slot = rcx;
4641 }
4642
4643 const Register r_array_index = *available_regs++;
4644 const Register r_bitmap = *available_regs++;
4645
4646 // The logic above guarantees this property, but we state it here.
4647 assert_different_registers(r_array_index, r_bitmap, rcx);
4648
4649 movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4650 movq(r_array_index, r_bitmap);
4651
4652 // First check the bitmap to see if super_klass might be present. If
4653 // the bit is zero, we are certain that super_klass is not one of
4654 // the secondary supers.
4655 movb(slot, Address(r_super_klass, Klass::hash_slot_offset()));
4656 xorl(slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1)); // slot ^ 63 === 63 - slot (mod 64)
4657 salq(r_array_index, slot);
4658
4659 testq(r_array_index, r_array_index);
4660 // We test the MSB of r_array_index, i.e. its sign bit
4661 jcc(Assembler::positive, L_failure);
4662
4663 const Register r_array_base = *available_regs++;
4664
4665 // Get the first array index that can contain super_klass into r_array_index.
4666 // Note: Clobbers r_array_base and slot.
4667 population_count(r_array_index, r_array_index, /*temp2*/r_array_base, /*temp3*/slot);
4668
4669 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
4670
4671 // We will consult the secondary-super array.
4672 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4673
4674 // We're asserting that the first word in an Array<Klass*> is the
4675 // length, and the second word is the first word of the data. If
4676 // that ever changes, r_array_base will have to be adjusted here.
4677 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4678 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4679
4680 cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
4681 jccb(Assembler::equal, L_success);
4682
4683 // Restore slot to its true value
4684 movb(slot, Address(r_super_klass, Klass::hash_slot_offset()));
4685
4686 // Linear probe. Rotate the bitmap so that the next bit to test is
4687 // in Bit 1.
4688 rorq(r_bitmap, slot);
4689
4690 // Is there another entry to check? Consult the bitmap.
4691 btq(r_bitmap, 1);
4692 jccb(Assembler::carryClear, L_failure);
4693
4694 // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
4695 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
4696 // Kills: r_array_length.
4697 // Returns: result.
4698 lookup_secondary_supers_table_slow_path(r_super_klass,
4699 r_array_base,
4700 r_array_index,
4701 r_bitmap,
4702 /*temp1*/result,
4703 /*temp2*/slot,
4704 &L_success,
4705 nullptr);
4706
4707 bind(L_failure);
4708 movq(result, 1);
4709 jmpb(L_fallthrough);
4710
4711 bind(L_success);
4712 xorq(result, result); // = 0
4713
4714 bind(L_fallthrough);
4715 BLOCK_COMMENT("} lookup_secondary_supers_table");
4716
4717 if (VerifySecondarySupers) {
4718 verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
4719 temp1, temp2, temp3);
4720 }
4721 }
4722
4723 void MacroAssembler::repne_scanq(Register addr, Register value, Register count, Register limit,
4724 Label* L_success, Label* L_failure) {
4725 Label L_loop, L_fallthrough;
4726 {
4727 int label_nulls = 0;
4728 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4729 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4730 assert(label_nulls <= 1, "at most one null in the batch");
4731 }
4732 bind(L_loop);
4733 cmpq(value, Address(addr, count, Address::times_8));
4734 jcc(Assembler::equal, *L_success);
4735 addl(count, 1);
4736 cmpl(count, limit);
4737 jcc(Assembler::less, L_loop);
4738
4739 if (&L_fallthrough != L_failure) {
4740 jmp(*L_failure);
4741 }
4742 bind(L_fallthrough);
4743 }
4744
4745 // Called by code generated by check_klass_subtype_slow_path
4746 // above. This is called when there is a collision in the hashed
4747 // lookup in the secondary supers array.
4748 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
4749 Register r_array_base,
4750 Register r_array_index,
4751 Register r_bitmap,
4752 Register temp1,
4753 Register temp2,
4754 Label* L_success,
4755 Label* L_failure) {
4756 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, temp2);
4757
4758 const Register
4759 r_array_length = temp1,
4760 r_sub_klass = noreg,
4761 result = noreg;
4762
4763 Label L_fallthrough;
4764 int label_nulls = 0;
4765 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4766 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4767 assert(label_nulls <= 1, "at most one null in the batch");
4768
4769 // Load the array length.
4770 movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4771 // And adjust the array base to point to the data.
4772 // NB! Effectively increments current slot index by 1.
4773 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
4774 addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());
4775
4776 // Linear probe
4777 Label L_huge;
4778
4779 // The bitmap is full to bursting.
4780 // Implicit invariant: BITMAP_FULL implies (length > 0)
4781 cmpl(r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
4782 jcc(Assembler::greater, L_huge);
4783
4784 // NB! Our caller has checked bits 0 and 1 in the bitmap. The
4785 // current slot (at secondary_supers[r_array_index]) has not yet
4786 // been inspected, and r_array_index may be out of bounds if we
4787 // wrapped around the end of the array.
4788
4789 { // This is conventional linear probing, but instead of terminating
4790 // when a null entry is found in the table, we maintain a bitmap
4791 // in which a 0 indicates missing entries.
4792 // The check above guarantees there are 0s in the bitmap, so the loop
4793 // eventually terminates.
4794
4795 xorl(temp2, temp2); // = 0;
4796
4797 Label L_again;
4798 bind(L_again);
4799
4800 // Check for array wraparound.
4801 cmpl(r_array_index, r_array_length);
4802 cmovl(Assembler::greaterEqual, r_array_index, temp2);
4803
4804 cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
4805 jcc(Assembler::equal, *L_success);
4806
4807 // If the next bit in bitmap is zero, we're done.
4808 btq(r_bitmap, 2); // look-ahead check (Bit 2); Bits 0 and 1 are tested by now
4809 jcc(Assembler::carryClear, *L_failure);
4810
4811 rorq(r_bitmap, 1); // Bits 1/2 => 0/1
4812 addl(r_array_index, 1);
4813
4814 jmp(L_again);
4815 }
4816
4817 { // Degenerate case: more than 64 secondary supers.
4818 // FIXME: We could do something smarter here, maybe a vectorized
4819 // comparison or a binary search, but is that worth any added
4820 // complexity?
4821 bind(L_huge);
4822 xorl(r_array_index, r_array_index); // = 0
4823 repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length,
4824 L_success,
4825 (&L_fallthrough != L_failure ? L_failure : nullptr));
4826
4827 bind(L_fallthrough);
4828 }
4829 }
4830
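// Record built on the stack by verify_secondary_supers_table below and passed
// to the failure handler; the field order is the reverse of the push order
// used there.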
4831 struct VerifyHelperArguments {
4832 Klass* _super;
4833 Klass* _sub;
4834 intptr_t _linear_result;
4835 intptr_t _table_result;
4836 };
4837
4838 static void verify_secondary_supers_table_helper(const char* msg, VerifyHelperArguments* args) {
4839 Klass::on_secondary_supers_verification_failure(args->_super,
4840 args->_sub,
4841 args->_linear_result,
4842 args->_table_result,
4843 msg);
4844 }
4845
4846 // Make sure that the hashed lookup and a linear scan agree.
4847 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
4848 Register r_super_klass,
4849 Register result,
4850 Register temp1,
4851 Register temp2,
4852 Register temp3) {
4853 const Register
4854 r_array_index = temp1,
4855 r_array_length = temp2,
4856 r_array_base = temp3,
4857 r_bitmap = noreg;
4858
4859 BLOCK_COMMENT("verify_secondary_supers_table {");
4860
4861 Label L_success, L_failure, L_check, L_done;
4862
4863 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4864 movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4865 // And adjust the array base to point to the data.
4866 addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());
4867
4868 testl(r_array_length, r_array_length); // array_length == 0?
4869 jcc(Assembler::zero, L_failure);
4870
4871 movl(r_array_index, 0);
4872 repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length, &L_success);
4873 // fall through to L_failure
4874
4875 const Register linear_result = r_array_index; // reuse temp1
4876
4877 bind(L_failure); // not present
4878 movl(linear_result, 1);
4879 jmp(L_check);
4880
4881 bind(L_success); // present
4882 movl(linear_result, 0);
4883
4884 bind(L_check);
4885 cmpl(linear_result, result);
4886 jcc(Assembler::equal, L_done);
4887
4888 { // To avoid calling convention issues, build a record on the stack
4889 // and pass the pointer to that instead.
4890 push(result);
4891 push(linear_result);
4892 push(r_sub_klass);
4893 push(r_super_klass);
4894 movptr(c_rarg1, rsp);
4895 movptr(c_rarg0, (uintptr_t) "mismatch");
4896 call(RuntimeAddress(CAST_FROM_FN_PTR(address, verify_secondary_supers_table_helper)));
4897 should_not_reach_here();
4898 }
4899 bind(L_done);
4900
4901 BLOCK_COMMENT("} verify_secondary_supers_table");
4902 }
4903
4904 #undef LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS
4905
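// Class initialization barrier. Branches to L_fast_path if 'klass' is fully
// initialized or is being initialized by the current thread, otherwise to
// L_slow_path. Either label (but not both) may be null, in which case that
// outcome falls through.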
4906 void MacroAssembler::clinit_barrier(Register klass, Label* L_fast_path, Label* L_slow_path) {
4907 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
4908
4909 Label L_fallthrough;
4910 if (L_fast_path == nullptr) {
4911 L_fast_path = &L_fallthrough;
4912 } else if (L_slow_path == nullptr) {
4913 L_slow_path = &L_fallthrough;
4914 }
4915
4916 // Fast path check: class is fully initialized.
4917 // init_state needs acquire, but x86 is TSO, and so we are already good.
4918 cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4919 jcc(Assembler::equal, *L_fast_path);
4920
4921 // Fast path check: current thread is initializer thread
4922 cmpptr(r15_thread, Address(klass, InstanceKlass::init_thread_offset()));
4923 if (L_slow_path == &L_fallthrough) {
4924 jcc(Assembler::equal, *L_fast_path);
4925 bind(*L_slow_path);
4926 } else if (L_fast_path == &L_fallthrough) {
4927 jcc(Assembler::notEqual, *L_slow_path);
4928 bind(*L_fast_path);
4929 } else {
4930 Unimplemented();
4931 }
4932 }
4933
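// Conditional move that also works on CPUs without CMOV support, where it is
// emulated by a short branch around a plain move.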
4934 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4935 if (VM_Version::supports_cmov()) {
4936 cmovl(cc, dst, src);
4937 } else {
4938 Label L;
4939 jccb(negate_condition(cc), L);
4940 movl(dst, src);
4941 bind(L);
4942 }
4943 }
4944
4945 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4946 if (VM_Version::supports_cmov()) {
4947 cmovl(cc, dst, src);
4948 } else {
4949 Label L;
4950 jccb(negate_condition(cc), L);
4951 movl(dst, src);
4952 bind(L);
4953 }
4954 }
4955
4956 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4957 if (!VerifyOops || VerifyAdapterSharing) {
4958 // The address of the code string pushed below confuses VerifyAdapterSharing
4959 // because it may differ between otherwise equivalent adapters.
4960 return;
4961 }
4962
4963 BLOCK_COMMENT("verify_oop {");
4964 push(rscratch1);
4965 push(rax); // save rax
4966 push(reg); // pass register argument
4967
4968 // Pass the error message (including the register name) to verify_oop_subroutine
4969 const char* b = nullptr;
4970 {
4971 ResourceMark rm;
4972 stringStream ss;
4973 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4974 b = code_string(ss.as_string());
4975 }
4976 AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
4977 pushptr(buffer.addr(), rscratch1);
4978
4979 // call indirectly to solve generation ordering problem
4980 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4981 call(rax);
4982 // Caller pops the arguments (oop, message) and restores rax, r10
4983 BLOCK_COMMENT("} verify_oop");
4984 }
4985
4986 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
4987 if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4988 // Only pcmpeq gets dependency-breaking treatment (i.e. execution can begin without
4989 // waiting for the previous value of dst), not vpcmpeqd, so just use vpternlog
4990 vpternlogd(dst, 0xFF, dst, dst, vector_len);
4991 } else if (VM_Version::supports_avx()) {
4992 vpcmpeqd(dst, dst, dst, vector_len);
4993 } else {
4994 pcmpeqd(dst, dst);
4995 }
4996 }
4997
4998 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4999 int extra_slot_offset) {
5000 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
5001 int stackElementSize = Interpreter::stackElementSize;
5002 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
5003 #ifdef ASSERT
5004 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
5005 assert(offset1 - offset == stackElementSize, "correct arithmetic");
5006 #endif
5007 Register scale_reg = noreg;
5008 Address::ScaleFactor scale_factor = Address::no_scale;
5009 if (arg_slot.is_constant()) {
5010 offset += arg_slot.as_constant() * stackElementSize;
5011 } else {
5012 scale_reg = arg_slot.as_register();
5013 scale_factor = Address::times(stackElementSize);
5014 }
5015 offset += wordSize; // return PC is on stack
5016 return Address(rsp, scale_reg, scale_factor, offset);
5017 }
5018
5019 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
5020 if (!VerifyOops || VerifyAdapterSharing) {
5021 // The address of the code string pushed below confuses VerifyAdapterSharing
5022 // because it may differ between otherwise equivalent adapters.
5023 return;
5024 }
5025
5026 push(rscratch1);
5027 push(rax); // save rax,
5028 // addr may contain rsp so we will have to adjust it based on the push
5029 // we just did (and on 64 bit we do two pushes)
5030 // NOTE: the 64-bit code seems to have had a bug here: it did movq(addr, rax), which
5031 // stores rax into addr, the reverse of what was intended.
5032 if (addr.uses(rsp)) {
5033 lea(rax, addr);
5034 pushptr(Address(rax, 2 * BytesPerWord));
5035 } else {
5036 pushptr(addr);
5037 }
5038
5039 // Pass the error message to verify_oop_subroutine
5040 const char* b = nullptr;
5041 {
5042 ResourceMark rm;
5043 stringStream ss;
5044 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
5045 b = code_string(ss.as_string());
5046 }
5047 AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
5048 pushptr(buffer.addr(), rscratch1);
5049
5050 // call indirectly to solve generation ordering problem
5051 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
5052 call(rax);
5053 // Caller pops the arguments (addr, message) and restores rax, r10.
5054 }
5055
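// Debug-only sanity check of the current thread's TLAB: verifies that
// start <= top <= end.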
5056 void MacroAssembler::verify_tlab() {
5057 #ifdef ASSERT
5058 if (UseTLAB && VerifyOops) {
5059 Label next, ok;
5060 Register t1 = rsi;
5061
5062 push(t1);
5063
5064 movptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
5065 cmpptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_start_offset())));
5066 jcc(Assembler::aboveEqual, next);
5067 STOP("assert(top >= start)");
5068 should_not_reach_here();
5069
5070 bind(next);
5071 movptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_end_offset())));
5072 cmpptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
5073 jcc(Assembler::aboveEqual, ok);
5074 STOP("assert(top <= end)");
5075 should_not_reach_here();
5076
5077 bind(ok);
5078 pop(t1);
5079 }
5080 #endif
5081 }
5082
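// The helper classes below describe the CPU state record handed to
// _print_CPU_state(); they exist only to pretty-print the FPU and integer
// registers for debugging.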
5083 class ControlWord {
5084 public:
5085 int32_t _value;
5086
5087 int rounding_control() const { return (_value >> 10) & 3 ; }
5088 int precision_control() const { return (_value >> 8) & 3 ; }
5089 bool precision() const { return ((_value >> 5) & 1) != 0; }
5090 bool underflow() const { return ((_value >> 4) & 1) != 0; }
5091 bool overflow() const { return ((_value >> 3) & 1) != 0; }
5092 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
5093 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
5094 bool invalid() const { return ((_value >> 0) & 1) != 0; }
5095
5096 void print() const {
5097 // rounding control
5098 const char* rc;
5099 switch (rounding_control()) {
5100 case 0: rc = "round near"; break;
5101 case 1: rc = "round down"; break;
5102 case 2: rc = "round up "; break;
5103 case 3: rc = "chop "; break;
5104 default:
5105 rc = nullptr; // silence compiler warnings
5106 fatal("Unknown rounding control: %d", rounding_control());
5107 };
5108 // precision control
5109 const char* pc;
5110 switch (precision_control()) {
5111 case 0: pc = "24 bits "; break;
5112 case 1: pc = "reserved"; break;
5113 case 2: pc = "53 bits "; break;
5114 case 3: pc = "64 bits "; break;
5115 default:
5116 pc = nullptr; // silence compiler warnings
5117 fatal("Unknown precision control: %d", precision_control());
5118 };
5119 // flags
5120 char f[9];
5121 f[0] = ' ';
5122 f[1] = ' ';
5123 f[2] = (precision ()) ? 'P' : 'p';
5124 f[3] = (underflow ()) ? 'U' : 'u';
5125 f[4] = (overflow ()) ? 'O' : 'o';
5126 f[5] = (zero_divide ()) ? 'Z' : 'z';
5127 f[6] = (denormalized()) ? 'D' : 'd';
5128 f[7] = (invalid ()) ? 'I' : 'i';
5129 f[8] = '\x0';
5130 // output
5131 printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
5132 }
5133
5134 };
5135
5136 class StatusWord {
5137 public:
5138 int32_t _value;
5139
5140 bool busy() const { return ((_value >> 15) & 1) != 0; }
5141 bool C3() const { return ((_value >> 14) & 1) != 0; }
5142 bool C2() const { return ((_value >> 10) & 1) != 0; }
5143 bool C1() const { return ((_value >> 9) & 1) != 0; }
5144 bool C0() const { return ((_value >> 8) & 1) != 0; }
5145 int top() const { return (_value >> 11) & 7 ; }
5146 bool error_status() const { return ((_value >> 7) & 1) != 0; }
5147 bool stack_fault() const { return ((_value >> 6) & 1) != 0; }
5148 bool precision() const { return ((_value >> 5) & 1) != 0; }
5149 bool underflow() const { return ((_value >> 4) & 1) != 0; }
5150 bool overflow() const { return ((_value >> 3) & 1) != 0; }
5151 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
5152 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
5153 bool invalid() const { return ((_value >> 0) & 1) != 0; }
5154
5155 void print() const {
5156 // condition codes
5157 char c[5];
5158 c[0] = (C3()) ? '3' : '-';
5159 c[1] = (C2()) ? '2' : '-';
5160 c[2] = (C1()) ? '1' : '-';
5161 c[3] = (C0()) ? '0' : '-';
5162 c[4] = '\x0';
5163 // flags
5164 char f[9];
5165 f[0] = (error_status()) ? 'E' : '-';
5166 f[1] = (stack_fault ()) ? 'S' : '-';
5167 f[2] = (precision ()) ? 'P' : '-';
5168 f[3] = (underflow ()) ? 'U' : '-';
5169 f[4] = (overflow ()) ? 'O' : '-';
5170 f[5] = (zero_divide ()) ? 'Z' : '-';
5171 f[6] = (denormalized()) ? 'D' : '-';
5172 f[7] = (invalid ()) ? 'I' : '-';
5173 f[8] = '\x0';
5174 // output
5175 printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, f, c, top());
5176 }
5177
5178 };
5179
5180 class TagWord {
5181 public:
5182 int32_t _value;
5183
5184 int tag_at(int i) const { return (_value >> (i*2)) & 3; }
5185
5186 void print() const {
5187 printf("%04x", _value & 0xFFFF);
5188 }
5189
5190 };
5191
5192 class FPU_Register {
5193 public:
5194 int32_t _m0;
5195 int32_t _m1;
5196 int16_t _ex;
5197
5198 bool is_indefinite() const {
5199 return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
5200 }
5201
5202 void print() const {
5203 char sign = (_ex < 0) ? '-' : '+';
5204 const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : " ";
5205 printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, kind);
5206 };
5207
5208 };
5209
5210 class FPU_State {
5211 public:
5212 enum {
5213 register_size = 10,
5214 number_of_registers = 8,
5215 register_mask = 7
5216 };
5217
5218 ControlWord _control_word;
5219 StatusWord _status_word;
5220 TagWord _tag_word;
5221 int32_t _error_offset;
5222 int32_t _error_selector;
5223 int32_t _data_offset;
5224 int32_t _data_selector;
5225 int8_t _register[register_size * number_of_registers];
5226
5227 int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
5228 FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; }
5229
5230 const char* tag_as_string(int tag) const {
5231 switch (tag) {
5232 case 0: return "valid";
5233 case 1: return "zero";
5234 case 2: return "special";
5235 case 3: return "empty";
5236 }
5237 ShouldNotReachHere();
5238 return nullptr;
5239 }
5240
5241 void print() const {
5242 // print computation registers
5243 { int t = _status_word.top();
5244 for (int i = 0; i < number_of_registers; i++) {
5245 int j = (i - t) & register_mask;
5246 printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
5247 st(j)->print();
5248 printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
5249 }
5250 }
5251 printf("\n");
5252 // print control registers
5253 printf("ctrl = "); _control_word.print(); printf("\n");
5254 printf("stat = "); _status_word .print(); printf("\n");
5255 printf("tags = "); _tag_word .print(); printf("\n");
5256 }
5257
5258 };
5259
5260 class Flag_Register {
5261 public:
5262 int32_t _value;
5263
5264 bool overflow() const { return ((_value >> 11) & 1) != 0; }
5265 bool direction() const { return ((_value >> 10) & 1) != 0; }
5266 bool sign() const { return ((_value >> 7) & 1) != 0; }
5267 bool zero() const { return ((_value >> 6) & 1) != 0; }
5268 bool auxiliary_carry() const { return ((_value >> 4) & 1) != 0; }
5269 bool parity() const { return ((_value >> 2) & 1) != 0; }
5270 bool carry() const { return ((_value >> 0) & 1) != 0; }
5271
5272 void print() const {
5273 // flags
5274 char f[8];
5275 f[0] = (overflow ()) ? 'O' : '-';
5276 f[1] = (direction ()) ? 'D' : '-';
5277 f[2] = (sign ()) ? 'S' : '-';
5278 f[3] = (zero ()) ? 'Z' : '-';
5279 f[4] = (auxiliary_carry()) ? 'A' : '-';
5280 f[5] = (parity ()) ? 'P' : '-';
5281 f[6] = (carry ()) ? 'C' : '-';
5282 f[7] = '\x0';
5283 // output
5284 printf("%08x flags = %s", _value, f);
5285 }
5286
5287 };
5288
5289 class IU_Register {
5290 public:
5291 int32_t _value;
5292
5293 void print() const {
5294 printf("%08x %11d", _value, _value);
5295 }
5296
5297 };
5298
5299 class IU_State {
5300 public:
5301 Flag_Register _eflags;
5302 IU_Register _rdi;
5303 IU_Register _rsi;
5304 IU_Register _rbp;
5305 IU_Register _rsp;
5306 IU_Register _rbx;
5307 IU_Register _rdx;
5308 IU_Register _rcx;
5309 IU_Register _rax;
5310
5311 void print() const {
5312 // computation registers
5313 printf("rax, = "); _rax.print(); printf("\n");
5314 printf("rbx, = "); _rbx.print(); printf("\n");
5315 printf("rcx = "); _rcx.print(); printf("\n");
5316 printf("rdx = "); _rdx.print(); printf("\n");
5317 printf("rdi = "); _rdi.print(); printf("\n");
5318 printf("rsi = "); _rsi.print(); printf("\n");
5319 printf("rbp, = "); _rbp.print(); printf("\n");
5320 printf("rsp = "); _rsp.print(); printf("\n");
5321 printf("\n");
5322 // control registers
5323 printf("flgs = "); _eflags.print(); printf("\n");
5324 }
5325 };
5326
5327
5328 class CPU_State {
5329 public:
5330 FPU_State _fpu_state;
5331 IU_State _iu_state;
5332
5333 void print() const {
5334 printf("--------------------------------------------------\n");
5335 _iu_state .print();
5336 printf("\n");
5337 _fpu_state.print();
5338 printf("--------------------------------------------------\n");
5339 }
5340
5341 };
5342
5343
5344 static void _print_CPU_state(CPU_State* state) {
5345 state->print();
5346 };
5347
5348
5349 void MacroAssembler::print_CPU_state() {
5350 push_CPU_state();
5351 push(rsp); // pass CPU state
5352 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5353 addptr(rsp, wordSize); // discard argument
5354 pop_CPU_state();
5355 }
5356
5357 void MacroAssembler::restore_cpu_control_state_after_jni(Register rscratch) {
5358 // Either restore the MXCSR register after returning from the JNI Call
5359 // or verify that it wasn't changed (with -Xcheck:jni flag).
5360 if (VM_Version::supports_sse()) {
5361 if (RestoreMXCSROnJNICalls) {
5362 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), rscratch);
5363 } else if (CheckJNICalls) {
5364 call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5365 }
5366 }
5367 // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5368 vzeroupper();
5369 }
5370
5371 // ((OopHandle)result).resolve();
5372 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5373 assert_different_registers(result, tmp);
5374
5375 // Only 64 bit platforms support GCs that require a tmp register
5376 // Only IN_HEAP loads require a thread_tmp register
5377 // OopHandle::resolve is an indirection like jobject.
5378 access_load_at(T_OBJECT, IN_NATIVE,
5379 result, Address(result, 0), tmp);
5380 }
5381
5382 // ((WeakHandle)result).resolve();
5383 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
5384 assert_different_registers(rresult, rtmp);
5385 Label resolved;
5386
5387 // A null weak handle resolves to null.
5388 cmpptr(rresult, 0);
5389 jcc(Assembler::equal, resolved);
5390
5391 // Only 64 bit platforms support GCs that require a tmp register
5392 // Only IN_HEAP loads require a thread_tmp register
5393 // WeakHandle::resolve is an indirection like jweak.
5394 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
5395 rresult, Address(rresult, 0), rtmp);
5396 bind(resolved);
5397 }
5398
5399 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
5400 // get mirror
5401 const int mirror_offset = in_bytes(Klass::java_mirror_offset());
5402 load_method_holder(mirror, method);
5403 movptr(mirror, Address(mirror, mirror_offset));
5404 resolve_oop_handle(mirror, tmp);
5405 }
5406
5407 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
5408 load_method_holder(rresult, rmethod);
5409 movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
5410 }
5411
5412 void MacroAssembler::load_method_holder(Register holder, Register method) {
5413 movptr(holder, Address(method, Method::const_offset())); // ConstMethod*
5414 movptr(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool*
5415 movptr(holder, Address(holder, ConstantPool::pool_holder_offset())); // InstanceKlass*
5416 }
5417
5418 void MacroAssembler::load_metadata(Register dst, Register src) {
5419 if (UseCompactObjectHeaders) {
5420 load_narrow_klass_compact(dst, src);
5421 } else if (UseCompressedClassPointers) {
5422 movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5423 } else {
5424 movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5425 }
5426 }
5427
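// With compact object headers the narrow klass is stored in the upper bits of
// the mark word; load the mark word and shift it down.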
5428 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
5429 assert(UseCompactObjectHeaders, "expect compact object headers");
5430 movq(dst, Address(src, oopDesc::mark_offset_in_bytes()));
5431 shrq(dst, markWord::klass_shift);
5432 }
5433
5434 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
5435 assert_different_registers(src, tmp);
5436 assert_different_registers(dst, tmp);
5437
5438 if (UseCompactObjectHeaders) {
5439 load_narrow_klass_compact(dst, src);
5440 decode_klass_not_null(dst, tmp);
5441 } else if (UseCompressedClassPointers) {
5442 movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5443 decode_klass_not_null(dst, tmp);
5444 } else {
5445 movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5446 }
5447 }
5448
5449 void MacroAssembler::load_prototype_header(Register dst, Register src, Register tmp) {
5450 load_klass(dst, src, tmp);
5451 movptr(dst, Address(dst, Klass::prototype_header_offset()));
5452 }
5453
5454 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
5455 assert(!UseCompactObjectHeaders, "not with compact headers");
5456 assert_different_registers(src, tmp);
5457 assert_different_registers(dst, tmp);
5458 if (UseCompressedClassPointers) {
5459 encode_klass_not_null(src, tmp);
5460 movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5461 } else {
5462 movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5463 }
5464 }
5465
5466 void MacroAssembler::cmp_klass(Register klass, Register obj, Register tmp) {
5467 if (UseCompactObjectHeaders) {
5468 assert(tmp != noreg, "need tmp");
5469 assert_different_registers(klass, obj, tmp);
5470 load_narrow_klass_compact(tmp, obj);
5471 cmpl(klass, tmp);
5472 } else if (UseCompressedClassPointers) {
5473 cmpl(klass, Address(obj, oopDesc::klass_offset_in_bytes()));
5474 } else {
5475 cmpptr(klass, Address(obj, oopDesc::klass_offset_in_bytes()));
5476 }
5477 }
5478
5479 void MacroAssembler::cmp_klasses_from_objects(Register obj1, Register obj2, Register tmp1, Register tmp2) {
5480 if (UseCompactObjectHeaders) {
5481 assert(tmp2 != noreg, "need tmp2");
5482 assert_different_registers(obj1, obj2, tmp1, tmp2);
5483 load_narrow_klass_compact(tmp1, obj1);
5484 load_narrow_klass_compact(tmp2, obj2);
5485 cmpl(tmp1, tmp2);
5486 } else if (UseCompressedClassPointers) {
5487 movl(tmp1, Address(obj1, oopDesc::klass_offset_in_bytes()));
5488 cmpl(tmp1, Address(obj2, oopDesc::klass_offset_in_bytes()));
5489 } else {
5490 movptr(tmp1, Address(obj1, oopDesc::klass_offset_in_bytes()));
5491 cmpptr(tmp1, Address(obj2, oopDesc::klass_offset_in_bytes()));
5492 }
5493 }
5494
5495 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5496 Register tmp1) {
5497 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5498 decorators = AccessInternal::decorator_fixup(decorators, type);
5499 bool as_raw = (decorators & AS_RAW) != 0;
5500 if (as_raw) {
5501 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1);
5502 } else {
5503 bs->load_at(this, decorators, type, dst, src, tmp1);
5504 }
5505 }
5506
5507 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
5508 Register tmp1, Register tmp2, Register tmp3) {
5509 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5510 decorators = AccessInternal::decorator_fixup(decorators, type);
5511 bool as_raw = (decorators & AS_RAW) != 0;
5512 if (as_raw) {
5513 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
5514 } else {
5515 bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
5516 }
5517 }
5518
5519 void MacroAssembler::flat_field_copy(DecoratorSet decorators, Register src, Register dst,
5520 Register inline_layout_info) {
5521 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5522 bs->flat_field_copy(this, decorators, src, dst, inline_layout_info);
5523 }
5524
5525 void MacroAssembler::payload_offset(Register inline_klass, Register offset) {
5526 movptr(offset, Address(inline_klass, InstanceKlass::adr_inlineklass_fixed_block_offset()));
5527 movl(offset, Address(offset, InlineKlass::payload_offset_offset()));
5528 }
5529
5530 void MacroAssembler::payload_addr(Register oop, Register data, Register inline_klass) {
5531 // ((address) (void*) o) + vk->payload_offset();
5532 Register offset = (data == oop) ? rscratch1 : data;
5533 payload_offset(inline_klass, offset);
5534 if (data == oop) {
5535 addptr(data, offset);
5536 } else {
5537 lea(data, Address(oop, offset));
5538 }
5539 }
5540
5541 void MacroAssembler::data_for_value_array_index(Register array, Register array_klass,
5542 Register index, Register data) {
5543 assert(index != rcx, "index needs to shift by rcx");
5544 assert_different_registers(array, array_klass, index);
5545 assert_different_registers(rcx, array, index);
5546
5547 // array->base() + (index << Klass::layout_helper_log2_element_size(lh));
5548 movl(rcx, Address(array_klass, Klass::layout_helper_offset()));
5549
5550 // Klass::layout_helper_log2_element_size(lh)
5551 // (lh >> _lh_log2_element_size_shift) & _lh_log2_element_size_mask;
5552 shrl(rcx, Klass::_lh_log2_element_size_shift);
5553 andl(rcx, Klass::_lh_log2_element_size_mask);
5554 shlptr(index); // index << rcx
5555
5556 lea(data, Address(array, index, Address::times_1, arrayOopDesc::base_offset_in_bytes(T_FLAT_ELEMENT)));
5557 }
5558
5559 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, DecoratorSet decorators) {
5560 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1);
5561 }
5562
5563 // Doesn't do verification, generates fixed size code
5564 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, DecoratorSet decorators) {
5565 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1);
5566 }
5567
5568 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
5569 Register tmp2, Register tmp3, DecoratorSet decorators) {
5570 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
5571 }
5572
5573 // Used for storing nulls.
5574 void MacroAssembler::store_heap_oop_null(Address dst) {
5575 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
5576 }
5577
5578 void MacroAssembler::store_klass_gap(Register dst, Register src) {
5579 assert(!UseCompactObjectHeaders, "Don't use with compact headers");
5580 if (UseCompressedClassPointers) {
5581 // Store to klass gap in destination
5582 movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5583 }
5584 }
5585
5586 #ifdef ASSERT
5587 void MacroAssembler::verify_heapbase(const char* msg) {
5588 assert (UseCompressedOops, "should be compressed");
5589 assert (Universe::heap() != nullptr, "java heap should be initialized");
5590 if (CheckCompressedOops) {
5591 Label ok;
5592 ExternalAddress src2(CompressedOops::base_addr());
5593 const bool is_src2_reachable = reachable(src2);
5594 if (!is_src2_reachable) {
5595 push(rscratch1); // cmpptr trashes rscratch1
5596 }
5597 cmpptr(r12_heapbase, src2, rscratch1);
5598 jcc(Assembler::equal, ok);
5599 STOP(msg);
5600 bind(ok);
5601 if (!is_src2_reachable) {
5602 pop(rscratch1);
5603 }
5604 }
5605 }
5606 #endif
5607
5608 // Algorithm must match oop.inline.hpp encode_heap_oop.
5609 void MacroAssembler::encode_heap_oop(Register r) {
5610 #ifdef ASSERT
5611 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5612 #endif
5613 verify_oop_msg(r, "broken oop in encode_heap_oop");
5614 if (CompressedOops::base() == nullptr) {
5615 if (CompressedOops::shift() != 0) {
5616 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5617 shrq(r, LogMinObjAlignmentInBytes);
5618 }
5619 return;
5620 }
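// A null oop must encode to zero: if r is null, substitute the heap base so
// that the subtraction below yields zero.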
5621 testq(r, r);
5622 cmovq(Assembler::equal, r, r12_heapbase);
5623 subq(r, r12_heapbase);
5624 shrq(r, LogMinObjAlignmentInBytes);
5625 }
5626
5627 void MacroAssembler::encode_heap_oop_not_null(Register r) {
5628 #ifdef ASSERT
5629 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5630 if (CheckCompressedOops) {
5631 Label ok;
5632 testq(r, r);
5633 jcc(Assembler::notEqual, ok);
5634 STOP("null oop passed to encode_heap_oop_not_null");
5635 bind(ok);
5636 }
5637 #endif
5638 verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
5639 if (CompressedOops::base() != nullptr) {
5640 subq(r, r12_heapbase);
5641 }
5642 if (CompressedOops::shift() != 0) {
5643 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5644 shrq(r, LogMinObjAlignmentInBytes);
5645 }
5646 }
5647
5648 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5649 #ifdef ASSERT
5650 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5651 if (CheckCompressedOops) {
5652 Label ok;
5653 testq(src, src);
5654 jcc(Assembler::notEqual, ok);
5655 STOP("null oop passed to encode_heap_oop_not_null2");
5656 bind(ok);
5657 }
5658 #endif
5659 verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
5660 if (dst != src) {
5661 movq(dst, src);
5662 }
5663 if (CompressedOops::base() != nullptr) {
5664 subq(dst, r12_heapbase);
5665 }
5666 if (CompressedOops::shift() != 0) {
5667 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5668 shrq(dst, LogMinObjAlignmentInBytes);
5669 }
5670 }
5671
5672 void MacroAssembler::decode_heap_oop(Register r) {
5673 #ifdef ASSERT
5674 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5675 #endif
5676 if (CompressedOops::base() == nullptr) {
5677 if (CompressedOops::shift() != 0) {
5678 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5679 shlq(r, LogMinObjAlignmentInBytes);
5680 }
5681 } else {
5682 Label done;
5683 shlq(r, LogMinObjAlignmentInBytes);
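// If the narrow oop was null, the shifted result is zero (ZF set) and we skip
// adding the heap base, so null decodes to null.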
5684 jccb(Assembler::equal, done);
5685 addq(r, r12_heapbase);
5686 bind(done);
5687 }
5688 verify_oop_msg(r, "broken oop in decode_heap_oop");
5689 }
5690
5691 void MacroAssembler::decode_heap_oop_not_null(Register r) {
5692 // Note: it will change flags
5693 assert (UseCompressedOops, "should only be used for compressed headers");
5694 assert (Universe::heap() != nullptr, "java heap should be initialized");
5695 // Cannot assert, unverified entry point counts instructions (see .ad file)
5696 // vtableStubs also counts instructions in pd_code_size_limit.
5697 // Also do not verify_oop as this is called by verify_oop.
5698 if (CompressedOops::shift() != 0) {
5699 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5700 shlq(r, LogMinObjAlignmentInBytes);
5701 if (CompressedOops::base() != nullptr) {
5702 addq(r, r12_heapbase);
5703 }
5704 } else {
5705 assert (CompressedOops::base() == nullptr, "sanity");
5706 }
5707 }
5708
5709 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5710 // Note: it will change flags
5711 assert (UseCompressedOops, "should only be used for compressed headers");
5712 assert (Universe::heap() != nullptr, "java heap should be initialized");
5713 // Cannot assert, unverified entry point counts instructions (see .ad file)
5714 // vtableStubs also counts instructions in pd_code_size_limit.
5715 // Also do not verify_oop as this is called by verify_oop.
5716 if (CompressedOops::shift() != 0) {
5717 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5718 if (LogMinObjAlignmentInBytes == Address::times_8) {
5719 leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5720 } else {
5721 if (dst != src) {
5722 movq(dst, src);
5723 }
5724 shlq(dst, LogMinObjAlignmentInBytes);
5725 if (CompressedOops::base() != nullptr) {
5726 addq(dst, r12_heapbase);
5727 }
5728 }
5729 } else {
5730 assert (CompressedOops::base() == nullptr, "sanity");
5731 if (dst != src) {
5732 movq(dst, src);
5733 }
5734 }
5735 }
5736
5737 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
5738 BLOCK_COMMENT("encode_klass_not_null {");
5739 assert_different_registers(r, tmp);
5740 if (CompressedKlassPointers::base() != nullptr) {
5741 if (AOTCodeCache::is_on_for_dump()) {
5742 movptr(tmp, ExternalAddress(CompressedKlassPointers::base_addr()));
5743 } else {
5744 movptr(tmp, (intptr_t)CompressedKlassPointers::base());
5745 }
5746 subq(r, tmp);
5747 }
5748 if (CompressedKlassPointers::shift() != 0) {
5749 shrq(r, CompressedKlassPointers::shift());
5750 }
5751 BLOCK_COMMENT("} encode_klass_not_null");
5752 }
5753
5754 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
5755 BLOCK_COMMENT("encode_and_move_klass_not_null {");
5756 assert_different_registers(src, dst);
5757 if (CompressedKlassPointers::base() != nullptr) {
5758 movptr(dst, -(intptr_t)CompressedKlassPointers::base());
5759 addq(dst, src);
5760 } else {
5761 movptr(dst, src);
5762 }
5763 if (CompressedKlassPointers::shift() != 0) {
5764 shrq(dst, CompressedKlassPointers::shift());
5765 }
5766 BLOCK_COMMENT("} encode_and_move_klass_not_null");
5767 }
5768
5769 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
5770 BLOCK_COMMENT("decode_klass_not_null {");
5771 assert_different_registers(r, tmp);
5772 // Note: it will change flags
5773 assert(UseCompressedClassPointers, "should only be used for compressed headers");
5774 // Cannot assert, unverified entry point counts instructions (see .ad file)
5775 // vtableStubs also counts instructions in pd_code_size_limit.
5776 // Also do not verify_oop as this is called by verify_oop.
5777 if (CompressedKlassPointers::shift() != 0) {
5778 shlq(r, CompressedKlassPointers::shift());
5779 }
5780 if (CompressedKlassPointers::base() != nullptr) {
5781 if (AOTCodeCache::is_on_for_dump()) {
5782 movptr(tmp, ExternalAddress(CompressedKlassPointers::base_addr()));
5783 } else {
5784 movptr(tmp, (intptr_t)CompressedKlassPointers::base());
5785 }
5786 addq(r, tmp);
5787 }
5788 BLOCK_COMMENT("} decode_klass_not_null");
5789 }
5790
5791 void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
5792 BLOCK_COMMENT("decode_and_move_klass_not_null {");
5793 assert_different_registers(src, dst);
5794 // Note: it will change flags
5795 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5796 // Cannot assert, unverified entry point counts instructions (see .ad file)
5797 // vtableStubs also counts instructions in pd_code_size_limit.
5798 // Also do not verify_oop as this is called by verify_oop.
5799
5800 if (CompressedKlassPointers::base() == nullptr &&
5801 CompressedKlassPointers::shift() == 0) {
5802 // The best case scenario is that there is no base or shift. Then it is already
5803 // a pointer that needs nothing but a register rename.
5804 movl(dst, src);
5805 } else {
5806 if (CompressedKlassPointers::shift() <= Address::times_8) {
5807 if (CompressedKlassPointers::base() != nullptr) {
5808 movptr(dst, (intptr_t)CompressedKlassPointers::base());
5809 } else {
5810 xorq(dst, dst);
5811 }
5812 if (CompressedKlassPointers::shift() != 0) {
5813 assert(CompressedKlassPointers::shift() == Address::times_8, "klass not aligned on 64bits?");
5814 leaq(dst, Address(dst, src, Address::times_8, 0));
5815 } else {
5816 addq(dst, src);
5817 }
5818 } else {
5819 if (CompressedKlassPointers::base() != nullptr) {
5820 const intptr_t base_right_shifted =
5821 (intptr_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
5822 movptr(dst, base_right_shifted);
5823 } else {
5824 xorq(dst, dst);
5825 }
5826 addq(dst, src);
5827 shlq(dst, CompressedKlassPointers::shift());
5828 }
5829 }
5830 BLOCK_COMMENT("} decode_and_move_klass_not_null");
5831 }
5832
5833 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5834 assert (UseCompressedOops, "should only be used for compressed headers");
5835 assert (Universe::heap() != nullptr, "java heap should be initialized");
5836 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5837 int oop_index = oop_recorder()->find_index(obj);
5838 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5839 mov_narrow_oop(dst, oop_index, rspec);
5840 }
5841
5842 void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5843 assert (UseCompressedOops, "should only be used for compressed headers");
5844 assert (Universe::heap() != nullptr, "java heap should be initialized");
5845 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5846 int oop_index = oop_recorder()->find_index(obj);
5847 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5848 mov_narrow_oop(dst, oop_index, rspec);
5849 }
5850
5851 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5852 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5853 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5854 int klass_index = oop_recorder()->find_index(k);
5855 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5856 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5857 }
5858
5859 void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5860 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5861 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5862 int klass_index = oop_recorder()->find_index(k);
5863 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5864 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5865 }
5866
5867 void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5868 assert (UseCompressedOops, "should only be used for compressed headers");
5869 assert (Universe::heap() != nullptr, "java heap should be initialized");
5870 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5871 int oop_index = oop_recorder()->find_index(obj);
5872 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5873 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5874 }
5875
5876 void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5877 assert (UseCompressedOops, "should only be used for compressed headers");
5878 assert (Universe::heap() != nullptr, "java heap should be initialized");
5879 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5880 int oop_index = oop_recorder()->find_index(obj);
5881 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5882 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5883 }
5884
5885 void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5886 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5887 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5888 int klass_index = oop_recorder()->find_index(k);
5889 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5890 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5891 }
5892
5893 void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5894 assert (UseCompressedClassPointers, "should only be used for compressed headers");
5895 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5896 int klass_index = oop_recorder()->find_index(k);
5897 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5898 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5899 }
5900
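// Re-establish r12_heapbase as the compressed oops base register: zero when
// running with a zero base, or loaded from CompressedOops::base_addr() before
// the heap is initialized.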
5901 void MacroAssembler::reinit_heapbase() {
5902 if (UseCompressedOops) {
5903 if (Universe::heap() != nullptr) {
5904 if (CompressedOops::base() == nullptr) {
5905 MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5906 } else {
5907 mov64(r12_heapbase, (int64_t)CompressedOops::base());
5908 }
5909 } else {
5910 movptr(r12_heapbase, ExternalAddress(CompressedOops::base_addr()));
5911 }
5912 }
5913 }
5914
5915 #if COMPILER2_OR_JVMCI
5916
5917 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
5918 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp, KRegister mask) {
5919 // cnt - number of qwords (8-byte words).
5920 // base - start address, qword aligned.
5921 Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5922 bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0);
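// Use 64-byte (ZMM) stores only when MaxVectorSize is 64 and AVX-512 is
// preferred for all sizes (avx3_threshold() == 0).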
5923 if (use64byteVector) {
5924 evpbroadcastq(xtmp, val, AVX_512bit);
5925 } else if (MaxVectorSize >= 32) {
5926 movdq(xtmp, val);
5927 punpcklqdq(xtmp, xtmp);
5928 vinserti128_high(xtmp, xtmp);
5929 } else {
5930 movdq(xtmp, val);
5931 punpcklqdq(xtmp, xtmp);
5932 }
5933 jmp(L_zero_64_bytes);
5934
5935 BIND(L_loop);
5936 if (MaxVectorSize >= 32) {
5937 fill64(base, 0, xtmp, use64byteVector);
5938 } else {
5939 movdqu(Address(base, 0), xtmp);
5940 movdqu(Address(base, 16), xtmp);
5941 movdqu(Address(base, 32), xtmp);
5942 movdqu(Address(base, 48), xtmp);
5943 }
5944 addptr(base, 64);
5945
5946 BIND(L_zero_64_bytes);
5947 subptr(cnt, 8);
5948 jccb(Assembler::greaterEqual, L_loop);
5949
5950 // Copy trailing 64 bytes
5951 if (use64byteVector) {
5952 addptr(cnt, 8);
5953 jccb(Assembler::equal, L_end);
5954 fill64_masked(3, base, 0, xtmp, mask, cnt, val, true);
5955 jmp(L_end);
5956 } else {
5957 addptr(cnt, 4);
5958 jccb(Assembler::less, L_tail);
5959 if (MaxVectorSize >= 32) {
5960 vmovdqu(Address(base, 0), xtmp);
5961 } else {
5962 movdqu(Address(base, 0), xtmp);
5963 movdqu(Address(base, 16), xtmp);
5964 }
5965 }
5966 addptr(base, 32);
5967 subptr(cnt, 4);
5968
5969 BIND(L_tail);
5970 addptr(cnt, 4);
5971 jccb(Assembler::lessEqual, L_end);
5972 if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
5973 fill32_masked(3, base, 0, xtmp, mask, cnt, val);
5974 } else {
5975 decrement(cnt);
5976
5977 BIND(L_sloop);
5978 movq(Address(base, 0), xtmp);
5979 addptr(base, 8);
5980 decrement(cnt);
5981 jccb(Assembler::greaterEqual, L_sloop);
5982 }
5983 BIND(L_end);
5984 }
5985
5986 int MacroAssembler::store_inline_type_fields_to_buf(ciInlineKlass* vk, bool from_interpreter) {
5987 assert(InlineTypeReturnedAsFields, "Inline types should be returned as fields here");
5988 // An inline type might be returned. If fields are in registers we
5989 // need to allocate an inline type instance and initialize it with
5990 // the value of the fields.
5991 Label skip;
5992 // We only need a new buffered inline type if an already buffered oop was not returned (the low bit of rax is set when fields are returned in registers)
5993 testptr(rax, 1);
5994 jcc(Assembler::zero, skip);
5995 int call_offset = -1;
5996
5997 #ifdef _LP64
5998 // The following code is similar to allocate_instance but has some slight differences,
5999 // e.g. the object size is never zero and is sometimes a constant, and storing the klass
6000 // pointer after allocation is unnecessary when vk != nullptr. allocate_instance does not cover these cases.
6001 Label slow_case;
6002 // 1. Try to allocate a new buffered inline instance either from TLAB or eden space
6003 mov(rscratch1, rax); // save rax for slow_case since *_allocate may corrupt it when allocation failed
6004 if (vk != nullptr) {
6005 // Called from C1, where the return type is statically known.
6006 movptr(rbx, (intptr_t)vk->get_InlineKlass());
6007 jint lh = vk->layout_helper();
6008 assert(lh != Klass::_lh_neutral_value, "inline class in return type must have been resolved");
6009 if (UseTLAB && !Klass::layout_helper_needs_slow_path(lh)) {
6010 tlab_allocate(rax, noreg, lh, r13, r14, slow_case);
6011 } else {
6012 jmp(slow_case);
6013 }
6014 } else {
6015 // Call from interpreter. RAX contains ((the InlineKlass* of the return type) | 0x01)
6016 mov(rbx, rax);
6017 andptr(rbx, -2);
6018 if (UseTLAB) {
6019 movl(r14, Address(rbx, Klass::layout_helper_offset()));
6020 testl(r14, Klass::_lh_instance_slow_path_bit);
6021 jcc(Assembler::notZero, slow_case);
6022 tlab_allocate(rax, r14, 0, r13, r14, slow_case);
6023 } else {
6024 jmp(slow_case);
6025 }
6026 }
6027 if (UseTLAB) {
6028 // 2. Initialize buffered inline instance header
6029 Register buffer_obj = rax;
6030 Register klass = rbx;
6031 if (UseCompactObjectHeaders) {
6032 Register mark_word = r13;
6033 movptr(mark_word, Address(klass, Klass::prototype_header_offset()));
6034 movptr(Address(buffer_obj, oopDesc::mark_offset_in_bytes()), mark_word);
6035 } else {
6036 movptr(Address(buffer_obj, oopDesc::mark_offset_in_bytes()), (intptr_t)markWord::inline_type_prototype().value());
6037 xorl(r13, r13);
6038 store_klass_gap(buffer_obj, r13);
6039 if (vk == nullptr) {
6040 // store_klass corrupts rbx(klass), so save it in r13 for later use (interpreter case only).
6041 mov(r13, klass);
6042 }
6043 store_klass(buffer_obj, klass, rscratch1);
6044 klass = r13;
6045 }
6046 // 3. Initialize its fields with an inline class specific handler
6047 if (vk != nullptr) {
6048 call(RuntimeAddress(vk->pack_handler())); // no need for call info as this will not safepoint.
6049 } else {
6050 movptr(rbx, Address(klass, InstanceKlass::adr_inlineklass_fixed_block_offset()));
6051 movptr(rbx, Address(rbx, InlineKlass::pack_handler_offset()));
6052 call(rbx);
6053 }
6054 jmp(skip);
6055 }
6056 bind(slow_case);
6057 // We failed to allocate a new inline type; fall back to a runtime call.
6058 // Some oop fields may be live in registers, but we cannot tell which.
6059 // The runtime call will take care of preserving them across a GC, if
6060 // there is one.
6061 mov(rax, rscratch1);
6062 #endif
6063
6064 if (from_interpreter) {
6065 super_call_VM_leaf(StubRoutines::store_inline_type_fields_to_buf());
6066 } else {
6067 call(RuntimeAddress(StubRoutines::store_inline_type_fields_to_buf()));
6068 call_offset = offset();
6069 }
6070
6071 bind(skip);
6072 return call_offset;
6073 }
6074
6075 // Move a value between registers/stack slots and update the reg_state
6076 bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]) {
6077 assert(from->is_valid() && to->is_valid(), "source and destination must be valid");
6078 if (reg_state[to->value()] == reg_written) {
6079 return true; // Already written
6080 }
6081 if (from != to && bt != T_VOID) {
6082 if (reg_state[to->value()] == reg_readonly) {
6083 return false; // Not yet writable
6084 }
6085 if (from->is_reg()) {
6086 if (to->is_reg()) {
6087 if (from->is_XMMRegister()) {
6088 if (bt == T_DOUBLE) {
6089 movdbl(to->as_XMMRegister(), from->as_XMMRegister());
6090 } else {
6091 assert(bt == T_FLOAT, "must be float");
6092 movflt(to->as_XMMRegister(), from->as_XMMRegister());
6093 }
6094 } else {
6095 movq(to->as_Register(), from->as_Register());
6096 }
6097 } else {
6098 int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6099 Address to_addr = Address(rsp, st_off);
6100 if (from->is_XMMRegister()) {
6101 if (bt == T_DOUBLE) {
6102 movdbl(to_addr, from->as_XMMRegister());
6103 } else {
6104 assert(bt == T_FLOAT, "must be float");
6105 movflt(to_addr, from->as_XMMRegister());
6106 }
6107 } else {
6108 movq(to_addr, from->as_Register());
6109 }
6110 }
6111 } else {
6112 Address from_addr = Address(rsp, from->reg2stack() * VMRegImpl::stack_slot_size + wordSize);
6113 if (to->is_reg()) {
6114 if (to->is_XMMRegister()) {
6115 if (bt == T_DOUBLE) {
6116 movdbl(to->as_XMMRegister(), from_addr);
6117 } else {
6118 assert(bt == T_FLOAT, "must be float");
6119 movflt(to->as_XMMRegister(), from_addr);
6120 }
6121 } else {
6122 movq(to->as_Register(), from_addr);
6123 }
6124 } else {
6125 int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6126 movq(r13, from_addr);
6127 movq(Address(rsp, st_off), r13);
6128 }
6129 }
6130 }
6131 // Update register states
6132 reg_state[from->value()] = reg_writable;
6133 reg_state[to->value()] = reg_written;
6134 return true;
6135 }
6136
6137 // Calculate the extra stack space required for packing or unpacking inline
6138 // args and adjust the stack pointer
6139 int MacroAssembler::extend_stack_for_inline_args(int args_on_stack) {
6140 // Two additional slots to account for the return address (one word, i.e. two 4-byte stack slots)
6141 int sp_inc = (args_on_stack + 2) * VMRegImpl::stack_slot_size;
6142 sp_inc = align_up(sp_inc, StackAlignmentInBytes);
6143 // Save the return address, adjust the stack (make sure it is properly
6144 // 16-byte aligned) and copy the return address to the new top of the stack.
6145 // The stack will be repaired on return (see MacroAssembler::remove_frame).
6146 assert(sp_inc > 0, "sanity");
6147 pop(r13);
6148 subptr(rsp, sp_inc);
6149 push(r13);
6150 return sp_inc;
6151 }
6152
6153 // Read all fields from an inline type buffer and store the field values in registers/stack slots.
6154 bool MacroAssembler::unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index,
6155 VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index,
6156 RegState reg_state[]) {
6157 assert(sig->at(sig_index)._bt == T_VOID, "should be at end delimiter");
6158 assert(from->is_valid(), "source must be valid");
6159 bool progress = false;
6160 #ifdef ASSERT
6161 const int start_offset = offset();
6162 #endif
6163
6164 Label L_null, L_notNull;
6165 // Don't use r14 as tmp because it's used for spilling (see MacroAssembler::spill_reg_for)
6166 Register tmp1 = r10;
6167 Register tmp2 = r13;
6168 Register fromReg = noreg;
6169 ScalarizedInlineArgsStream stream(sig, sig_index, to, to_count, to_index, -1);
6170 bool done = true;
6171 bool mark_done = true;
6172 VMReg toReg;
6173 BasicType bt;
6174 // Check if argument requires a null check
6175 bool null_check = false;
6176 VMReg nullCheckReg;
6177 while (stream.next(nullCheckReg, bt)) {
6178 if (sig->at(stream.sig_index())._offset == -1) {
6179 null_check = true;
6180 break;
6181 }
6182 }
6183 stream.reset(sig_index, to_index);
6184 while (stream.next(toReg, bt)) {
6185 assert(toReg->is_valid(), "destination must be valid");
6186 int idx = (int)toReg->value();
6187 if (reg_state[idx] == reg_readonly) {
6188 if (idx != from->value()) {
6189 mark_done = false;
6190 }
6191 done = false;
6192 continue;
6193 } else if (reg_state[idx] == reg_written) {
6194 continue;
6195 }
6196 assert(reg_state[idx] == reg_writable, "must be writable");
6197 reg_state[idx] = reg_written;
6198 progress = true;
6199
6200 if (fromReg == noreg) {
6201 if (from->is_reg()) {
6202 fromReg = from->as_Register();
6203 } else {
6204 int st_off = from->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6205 movq(tmp1, Address(rsp, st_off));
6206 fromReg = tmp1;
6207 }
6208 if (null_check) {
6209 // Nullable inline type argument, emit null check
6210 testptr(fromReg, fromReg);
6211 jcc(Assembler::zero, L_null);
6212 }
6213 }
6214 int off = sig->at(stream.sig_index())._offset;
6215 if (off == -1) {
6216 assert(null_check, "Missing null check");
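// This is the null marker field: store 1 to flag the argument as non-null
// (the null case stores 0 under L_null below).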
6217 if (toReg->is_stack()) {
6218 int st_off = toReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6219 movq(Address(rsp, st_off), 1);
6220 } else {
6221 movq(toReg->as_Register(), 1);
6222 }
6223 continue;
6224 }
6225 assert(off > 0, "offset in object should be positive");
6226 Address fromAddr = Address(fromReg, off);
6227 if (!toReg->is_XMMRegister()) {
6228 Register dst = toReg->is_stack() ? tmp2 : toReg->as_Register();
6229 if (is_reference_type(bt)) {
6230 load_heap_oop(dst, fromAddr);
6231 } else {
6232 bool is_signed = (bt != T_CHAR) && (bt != T_BOOLEAN);
6233 load_sized_value(dst, fromAddr, type2aelembytes(bt), is_signed);
6234 }
6235 if (toReg->is_stack()) {
6236 int st_off = toReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6237 movq(Address(rsp, st_off), dst);
6238 }
6239 } else if (bt == T_DOUBLE) {
6240 movdbl(toReg->as_XMMRegister(), fromAddr);
6241 } else {
6242 assert(bt == T_FLOAT, "must be float");
6243 movflt(toReg->as_XMMRegister(), fromAddr);
6244 }
6245 }
6246 if (progress && null_check) {
6247 if (done) {
6248 jmp(L_notNull);
6249 bind(L_null);
6250 // Set null marker to zero to signal that the argument is null.
6251 // Also set all oop fields to zero to make the GC happy.
6252 stream.reset(sig_index, to_index);
6253 while (stream.next(toReg, bt)) {
6254 if (sig->at(stream.sig_index())._offset == -1 ||
6255 bt == T_OBJECT || bt == T_ARRAY) {
6256 if (toReg->is_stack()) {
6257 int st_off = toReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6258 movq(Address(rsp, st_off), 0);
6259 } else {
6260 xorq(toReg->as_Register(), toReg->as_Register());
6261 }
6262 }
6263 }
6264 bind(L_notNull);
6265 } else {
6266 bind(L_null);
6267 }
6268 }
6269
6270 sig_index = stream.sig_index();
6271 to_index = stream.regs_index();
6272
6273 if (mark_done && reg_state[from->value()] != reg_written) {
6274 // This is okay because no one else will write to that slot
6275 reg_state[from->value()] = reg_writable;
6276 }
6277 from_index--;
6278 assert(progress || (start_offset == offset()), "should not emit code");
6279 return done;
6280 }
6281
6282 bool MacroAssembler::pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
6283 VMRegPair* from, int from_count, int& from_index, VMReg to,
6284 RegState reg_state[], Register val_array) {
6285 assert(sig->at(sig_index)._bt == T_METADATA, "should be at delimiter");
6286 assert(to->is_valid(), "destination must be valid");
6287
6288 if (reg_state[to->value()] == reg_written) {
6289 skip_unpacked_fields(sig, sig_index, from, from_count, from_index);
6290 return true; // Already written
6291 }
6292
6293 // TODO 8284443 Isn't it an issue if the code below uses r14 as tmp while it holds a spilled value?
6294 // Be careful with r14 because it's used for spilling (see MacroAssembler::spill_reg_for).
6295 Register val_obj_tmp = r11;
6296 Register from_reg_tmp = r14;
6297 Register tmp1 = r10;
6298 Register tmp2 = r13;
6299 Register tmp3 = rbx;
6300 Register val_obj = to->is_stack() ? val_obj_tmp : to->as_Register();
6301
6302 assert_different_registers(val_obj_tmp, from_reg_tmp, tmp1, tmp2, tmp3, val_array);
6303
6304 if (reg_state[to->value()] == reg_readonly) {
6305 if (!is_reg_in_unpacked_fields(sig, sig_index, to, from, from_count, from_index)) {
6306 skip_unpacked_fields(sig, sig_index, from, from_count, from_index);
6307 return false; // Not yet writable
6308 }
6309 val_obj = val_obj_tmp;
6310 }
6311
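// Load the value object to be packed for this argument from val_array at
// element index vtarg_index.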
6312 int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + vtarg_index * type2aelembytes(T_OBJECT);
6313 load_heap_oop(val_obj, Address(val_array, index));
6314
6315 ScalarizedInlineArgsStream stream(sig, sig_index, from, from_count, from_index);
6316 VMReg fromReg;
6317 BasicType bt;
6318 Label L_null;
6319 while (stream.next(fromReg, bt)) {
6320 assert(fromReg->is_valid(), "source must be valid");
6321 reg_state[fromReg->value()] = reg_writable;
6322
6323 int off = sig->at(stream.sig_index())._offset;
6324 if (off == -1) {
6325 // Nullable inline type argument, emit null check
6326 Label L_notNull;
6327 if (fromReg->is_stack()) {
6328 int ld_off = fromReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6329 testb(Address(rsp, ld_off), 1);
6330 } else {
6331 testb(fromReg->as_Register(), 1);
6332 }
6333 jcc(Assembler::notZero, L_notNull);
6334 movptr(val_obj, 0);
6335 jmp(L_null);
6336 bind(L_notNull);
6337 continue;
6338 }
6339
6340 assert(off > 0, "offset in object should be positive");
6341 size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
6342
6343 // Pack the scalarized field into the value object.
6344 Address dst(val_obj, off);
6345 if (!fromReg->is_XMMRegister()) {
6346 Register src;
6347 if (fromReg->is_stack()) {
6348 src = from_reg_tmp;
6349 int ld_off = fromReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6350 load_sized_value(src, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
6351 } else {
6352 src = fromReg->as_Register();
6353 }
6354 assert_different_registers(dst.base(), src, tmp1, tmp2, tmp3, val_array);
6355 if (is_reference_type(bt)) {
6356 store_heap_oop(dst, src, tmp1, tmp2, tmp3, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
6357 } else {
6358 store_sized_value(dst, src, size_in_bytes);
6359 }
6360 } else if (bt == T_DOUBLE) {
6361 movdbl(dst, fromReg->as_XMMRegister());
6362 } else {
6363 assert(bt == T_FLOAT, "must be float");
6364 movflt(dst, fromReg->as_XMMRegister());
6365 }
6366 }
6367 bind(L_null);
6368 sig_index = stream.sig_index();
6369 from_index = stream.regs_index();
6370
6371 assert(reg_state[to->value()] == reg_writable, "must have already been read");
6372 bool success = move_helper(val_obj->as_VMReg(), to, T_OBJECT, reg_state);
6373 assert(success, "to register must be writable");
6374 return true;
6375 }
6376
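// Scratch register used when a value has to be temporarily spilled during
// inline type argument packing/unpacking: xmm8 for XMM values, r14 otherwise
// (see the note about r14 in pack_inline_helper above).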
6377 VMReg MacroAssembler::spill_reg_for(VMReg reg) {
6378 return reg->is_XMMRegister() ? xmm8->as_VMReg() : r14->as_VMReg();
6379 }
6380
6381 void MacroAssembler::remove_frame(int initial_framesize, bool needs_stack_repair) {
6382 assert((initial_framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
6383 if (needs_stack_repair) {
6384 // TODO 8284443 Add a comment drawing the frame like in Aarch64's version of MacroAssembler::remove_frame
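// Rough sketch of the assumed frame layout when needs_stack_repair is true
// (cf. the AArch64 version referenced above; offsets taken from the code below):
//   [rsp + initial_framesize]            saved rbp
//   [rsp + initial_framesize - wordSize] stack increment: total amount to add to
//                                        rsp to remove the frame, including any
//                                        extension made for scalarized arguments
//   [rsp]                                bottom of the fixed part of the frame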
6385 movq(rbp, Address(rsp, initial_framesize));
6386 // The stack increment resides just below the saved rbp
6387 addq(rsp, Address(rsp, initial_framesize - wordSize));
6388 } else {
6389 if (initial_framesize > 0) {
6390 addq(rsp, initial_framesize);
6391 }
6392 pop(rbp);
6393 }
6394 }
6395
6396 // Clearing constant-sized memory using YMM/ZMM registers.
6397 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
6398 assert(UseAVX > 2 && VM_Version::supports_avx512vl(), "");
6399 bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
6400
6401 int vector64_count = (cnt & (~0x7)) >> 3;
6402 cnt = cnt & 0x7;
6403 const int fill64_per_loop = 4;
6404 const int max_unrolled_fill64 = 8;
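// Strategy: counts of more than max_unrolled_fill64 64-byte chunks are cleared in a
// loop writing fill64_per_loop chunks per iteration; the leftover chunks (and all
// chunks for smaller counts) are fully unrolled; the trailing 0-7 qwords are written
// by the masked/narrow stores in the switch at the end.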
6405
6406 // 64 byte initialization loop.
6407 vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
6408 int start64 = 0;
6409 if (vector64_count > max_unrolled_fill64) {
6410 Label LOOP;
6411 Register index = rtmp;
6412
6413 start64 = vector64_count - (vector64_count % fill64_per_loop);
6414
6415 movl(index, 0);
6416 BIND(LOOP);
6417 for (int i = 0; i < fill64_per_loop; i++) {
6418 fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
6419 }
6420 addl(index, fill64_per_loop * 64);
6421 cmpl(index, start64 * 64);
6422 jccb(Assembler::less, LOOP);
6423 }
6424 for (int i = start64; i < vector64_count; i++) {
6425 fill64(base, i * 64, xtmp, use64byteVector);
6426 }
6427
6428 // Clear the remaining tail (up to 7 qwords) after the 64-byte chunks.
6429 int disp = vector64_count * 64;
6430 if (cnt) {
6431 switch (cnt) {
6432 case 1:
6433 movq(Address(base, disp), xtmp);
6434 break;
6435 case 2:
6436 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit);
6437 break;
6438 case 3:
6439 movl(rtmp, 0x7);
6440 kmovwl(mask, rtmp);
6441 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit);
6442 break;
6443 case 4:
6444 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
6445 break;
6446 case 5:
6447 if (use64byteVector) {
6448 movl(rtmp, 0x1F);
6449 kmovwl(mask, rtmp);
6450 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
6451 } else {
6452 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
6453 movq(Address(base, disp + 32), xtmp);
6454 }
6455 break;
6456 case 6:
6457 if (use64byteVector) {
6458 movl(rtmp, 0x3F);
6459 kmovwl(mask, rtmp);
6460 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
6461 } else {
6462 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
6463 evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit);
6464 }
6465 break;
6466 case 7:
6467 if (use64byteVector) {
6468 movl(rtmp, 0x7F);
6469 kmovwl(mask, rtmp);
6470 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
6471 } else {
6472 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
6473 movl(rtmp, 0x7);
6474 kmovwl(mask, rtmp);
6475 evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit);
6476 }
6477 break;
6478 default:
6479 fatal("Unexpected length : %d\n",cnt);
6480 break;
6481 }
6482 }
6483 }
6484
6485 void MacroAssembler::clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp,
6486 bool is_large, bool word_copy_only, KRegister mask) {
6487 // cnt - number of qwords (8-byte words).
6488 // base - start address, qword aligned.
6489 // is_large - true if the optimizer knows cnt is larger than InitArrayShortSize
6490 assert(base==rdi, "base register must be rdi for rep stos");
6491 assert(val==rax, "val register must be rax for rep stos");
6492 assert(cnt==rcx, "cnt register must be rcx for rep stos");
6493 assert(InitArrayShortSize % BytesPerLong == 0,
6494 "InitArrayShortSize should be the multiple of BytesPerLong");
6495
6496 Label DONE;
6497
6498 if (!is_large) {
6499 Label LOOP, LONG;
6500 cmpptr(cnt, InitArrayShortSize/BytesPerLong);
6501 jccb(Assembler::greater, LONG);
6502
6503 decrement(cnt);
6504 jccb(Assembler::negative, DONE); // Zero length
6505
6506 // Use individual pointer-sized stores for small counts:
6507 BIND(LOOP);
6508 movptr(Address(base, cnt, Address::times_ptr), val);
6509 decrement(cnt);
6510 jccb(Assembler::greaterEqual, LOOP);
6511 jmpb(DONE);
6512
6513 BIND(LONG);
6514 }
6515
6516 // Use longer rep-prefixed ops for non-small counts:
6517 if (UseFastStosb && !word_copy_only) {
6518 shlptr(cnt, 3); // convert to number of bytes
6519 rep_stosb();
6520 } else if (UseXMMForObjInit) {
6521 xmm_clear_mem(base, cnt, val, xtmp, mask);
6522 } else {
6523 rep_stos();
6524 }
6525
6526 BIND(DONE);
6527 }
6528
6529 #endif //COMPILER2_OR_JVMCI
6530
6531
6532 void MacroAssembler::generate_fill(BasicType t, bool aligned,
6533 Register to, Register value, Register count,
6534 Register rtmp, XMMRegister xtmp) {
6535 ShortBranchVerifier sbv(this);
6536 assert_different_registers(to, value, count, rtmp);
6537 Label L_exit;
6538 Label L_fill_2_bytes, L_fill_4_bytes;
6539
6540 #if defined(COMPILER2)
6541 if(MaxVectorSize >=32 &&
6542 VM_Version::supports_avx512vlbw() &&
6543 VM_Version::supports_bmi2()) {
6544 generate_fill_avx3(t, to, value, count, rtmp, xtmp);
6545 return;
6546 }
6547 #endif
6548
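// 'shift' is chosen so that (1 << shift) elements occupy 4 bytes: count is in
// elements, so e.g. (8 << shift) elements always correspond to 32 bytes.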
6549 int shift = -1;
6550 switch (t) {
6551 case T_BYTE:
6552 shift = 2;
6553 break;
6554 case T_SHORT:
6555 shift = 1;
6556 break;
6557 case T_INT:
6558 shift = 0;
6559 break;
6560 default: ShouldNotReachHere();
6561 }
6562
6563 if (t == T_BYTE) {
6564 andl(value, 0xff);
6565 movl(rtmp, value);
6566 shll(rtmp, 8);
6567 orl(value, rtmp);
6568 }
6569 if (t == T_SHORT) {
6570 andl(value, 0xffff);
6571 }
6572 if (t == T_BYTE || t == T_SHORT) {
6573 movl(rtmp, value);
6574 shll(rtmp, 16);
6575 orl(value, rtmp);
6576 }
6577
6578 cmpptr(count, 8 << shift); // Short arrays (< 32 bytes) fill by element
6579 jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
6580 if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
6581 Label L_skip_align2;
6582 // align destination address to a 4-byte boundary
6583 if (t == T_BYTE) {
6584 Label L_skip_align1;
6585 // One byte misalignment happens only for byte arrays
6586 testptr(to, 1);
6587 jccb(Assembler::zero, L_skip_align1);
6588 movb(Address(to, 0), value);
6589 increment(to);
6590 decrement(count);
6591 BIND(L_skip_align1);
6592 }
6593 // Two bytes misalignment happens only for byte and short (char) arrays
6594 testptr(to, 2);
6595 jccb(Assembler::zero, L_skip_align2);
6596 movw(Address(to, 0), value);
6597 addptr(to, 2);
6598 subptr(count, 1<<(shift-1));
6599 BIND(L_skip_align2);
6600 }
6601 {
6602 Label L_fill_32_bytes;
6603 if (!UseUnalignedLoadStores) {
6604 // align to 8 bytes, we know we are 4 byte aligned to start
6605 testptr(to, 4);
6606 jccb(Assembler::zero, L_fill_32_bytes);
6607 movl(Address(to, 0), value);
6608 addptr(to, 4);
6609 subptr(count, 1<<shift);
6610 }
6611 BIND(L_fill_32_bytes);
6612 {
6613 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
6614 movdl(xtmp, value);
6615 if (UseAVX >= 2 && UseUnalignedLoadStores) {
6616 Label L_check_fill_32_bytes;
6617 if (UseAVX > 2) {
6618 // Fill 64-byte chunks
6619 Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
6620
6621 // If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2
6622 cmpptr(count, VM_Version::avx3_threshold());
6623 jccb(Assembler::below, L_check_fill_64_bytes_avx2);
6624
6625 vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
6626
6627 subptr(count, 16 << shift);
6628 jccb(Assembler::less, L_check_fill_32_bytes);
6629 align(16);
6630
6631 BIND(L_fill_64_bytes_loop_avx3);
6632 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
6633 addptr(to, 64);
6634 subptr(count, 16 << shift);
6635 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
6636 jmpb(L_check_fill_32_bytes);
6637
6638 BIND(L_check_fill_64_bytes_avx2);
6639 }
6640 // Fill 64-byte chunks
6641 vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
6642
6643 subptr(count, 16 << shift);
6644 jcc(Assembler::less, L_check_fill_32_bytes);
6645
6646 // align data for 64-byte chunks
6647 Label L_fill_64_bytes_loop, L_align_64_bytes_loop;
6648 if (EnableX86ECoreOpts) {
6649 // align 'big' arrays to cache lines to minimize split_stores
6650 cmpptr(count, 96 << shift);
6651 jcc(Assembler::below, L_fill_64_bytes_loop);
6652
6653 // Find the bytes needed for alignment
6654 movptr(rtmp, to);
6655 andptr(rtmp, 0x1c);
6656 jcc(Assembler::zero, L_fill_64_bytes_loop);
6657 negptr(rtmp); // bytes needed to reach a 32-byte boundary: 32 - (to & 0x1c)
6658 addptr(rtmp, 32);
6659 shrptr(rtmp, 2 - shift); // convert the byte count into an element count
6660 subptr(count, rtmp); // adjust count by the elements stored in the alignment loop below
6661
6662 align(16);
6663 BIND(L_align_64_bytes_loop);
6664 movdl(Address(to, 0), xtmp);
6665 addptr(to, 4);
6666 subptr(rtmp, 1 << shift);
6667 jcc(Assembler::greater, L_align_64_bytes_loop);
6668 }
6669
6670 align(16);
6671 BIND(L_fill_64_bytes_loop);
6672 vmovdqu(Address(to, 0), xtmp);
6673 vmovdqu(Address(to, 32), xtmp);
6674 addptr(to, 64);
6675 subptr(count, 16 << shift);
6676 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
6677
6678 align(16);
6679 BIND(L_check_fill_32_bytes);
6680 addptr(count, 8 << shift);
6681 jccb(Assembler::less, L_check_fill_8_bytes);
6682 vmovdqu(Address(to, 0), xtmp);
6683 addptr(to, 32);
6684 subptr(count, 8 << shift);
6685
6686 BIND(L_check_fill_8_bytes);
6687 // clean upper bits of YMM registers
6688 movdl(xtmp, value);
6689 pshufd(xtmp, xtmp, 0);
6690 } else {
6691 // Fill 32-byte chunks
6692 pshufd(xtmp, xtmp, 0);
6693
6694 subptr(count, 8 << shift);
6695 jcc(Assembler::less, L_check_fill_8_bytes);
6696 align(16);
6697
6698 BIND(L_fill_32_bytes_loop);
6699
6700 if (UseUnalignedLoadStores) {
6701 movdqu(Address(to, 0), xtmp);
6702 movdqu(Address(to, 16), xtmp);
6703 } else {
6704 movq(Address(to, 0), xtmp);
6705 movq(Address(to, 8), xtmp);
6706 movq(Address(to, 16), xtmp);
6707 movq(Address(to, 24), xtmp);
6708 }
6709
6710 addptr(to, 32);
6711 subptr(count, 8 << shift);
6712 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
6713
6714 BIND(L_check_fill_8_bytes);
6715 }
6716 addptr(count, 8 << shift);
6717 jccb(Assembler::zero, L_exit);
6718 jmpb(L_fill_8_bytes);
6719
6720 //
6721 // length is too short, just fill qwords
6722 //
6723 align(16);
6724 BIND(L_fill_8_bytes_loop);
6725 movq(Address(to, 0), xtmp);
6726 addptr(to, 8);
6727 BIND(L_fill_8_bytes);
6728 subptr(count, 1 << (shift + 1));
6729 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
6730 }
6731 }
6732
6733 Label L_fill_4_bytes_loop;
6734 testl(count, 1 << shift);
6735 jccb(Assembler::zero, L_fill_2_bytes);
6736
6737 align(16);
6738 BIND(L_fill_4_bytes_loop);
6739 movl(Address(to, 0), value);
6740 addptr(to, 4);
6741
6742 BIND(L_fill_4_bytes);
6743 subptr(count, 1 << shift);
6744 jccb(Assembler::greaterEqual, L_fill_4_bytes_loop);
6745
6746 if (t == T_BYTE || t == T_SHORT) {
6747 Label L_fill_byte;
6748 BIND(L_fill_2_bytes);
6749 // fill trailing 2 bytes
6750 testl(count, 1<<(shift-1));
6751 jccb(Assembler::zero, L_fill_byte);
6752 movw(Address(to, 0), value);
6753 if (t == T_BYTE) {
6754 addptr(to, 2);
6755 BIND(L_fill_byte);
6756 // fill trailing byte
6757 testl(count, 1);
6758 jccb(Assembler::zero, L_exit);
6759 movb(Address(to, 0), value);
6760 } else {
6761 BIND(L_fill_byte);
6762 }
6763 } else {
6764 BIND(L_fill_2_bytes);
6765 }
6766 BIND(L_exit);
6767 }
6768
6769 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
6770 switch(type) {
6771 case T_BYTE:
6772 case T_BOOLEAN:
6773 evpbroadcastb(dst, src, vector_len);
6774 break;
6775 case T_SHORT:
6776 case T_CHAR:
6777 evpbroadcastw(dst, src, vector_len);
6778 break;
6779 case T_INT:
6780 case T_FLOAT:
6781 evpbroadcastd(dst, src, vector_len);
6782 break;
6783 case T_LONG:
6784 case T_DOUBLE:
6785 evpbroadcastq(dst, src, vector_len);
6786 break;
6787 default:
6788 fatal("Unhandled type : %s", type2name(type));
6789 break;
6790 }
6791 }
6792
6793 // Encode given char[]/byte[] to byte[] in ISO_8859_1 or ASCII
6794 //
6795 // @IntrinsicCandidate
6796 // int sun.nio.cs.ISO_8859_1.Encoder#encodeISOArray0(
6797 // char[] sa, int sp, byte[] da, int dp, int len) {
6798 // int i = 0;
6799 // for (; i < len; i++) {
6800 // char c = sa[sp++];
6801 // if (c > '\u00FF')
6802 // break;
6803 // da[dp++] = (byte) c;
6804 // }
6805 // return i;
6806 // }
6807 //
6808 // @IntrinsicCandidate
6809 // int java.lang.StringCoding.encodeISOArray0(
6810 // byte[] sa, int sp, byte[] da, int dp, int len) {
6811 // int i = 0;
6812 // for (; i < len; i++) {
6813 // char c = StringUTF16.getChar(sa, sp++);
6814 // if (c > '\u00FF')
6815 // break;
6816 // da[dp++] = (byte) c;
6817 // }
6818 // return i;
6819 // }
6820 //
6821 // @IntrinsicCandidate
6822 // int java.lang.StringCoding.encodeAsciiArray0(
6823 // char[] sa, int sp, byte[] da, int dp, int len) {
6824 // int i = 0;
6825 // for (; i < len; i++) {
6826 // char c = sa[sp++];
6827 // if (c >= '\u0080')
6828 // break;
6829 // da[dp++] = (byte) c;
6830 // }
6831 // return i;
6832 // }
6833 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
6834 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
6835 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
6836 Register tmp5, Register result, bool ascii) {
6837
6838 // rsi: src
6839 // rdi: dst
6840 // rdx: len
6841 // rcx: tmp5
6842 // rax: result
6843 ShortBranchVerifier sbv(this);
6844 assert_different_registers(src, dst, len, tmp5, result);
6845 Label L_done, L_copy_1_char, L_copy_1_char_exit;
6846
6847 int mask = ascii ? 0xff80ff80 : 0xff00ff00;
6848 int short_mask = ascii ? 0xff80 : 0xff00;
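// The masks select the bits that must be zero for a char to be encodable:
// bits 8..15 for ISO-8859-1 (char <= 0xFF) or bits 7..15 for ASCII (char <= 0x7F),
// replicated across the 16-bit lanes for the vector checks below.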
6849
6850 // set result
6851 xorl(result, result);
6852 // check for zero length
6853 testl(len, len);
6854 jcc(Assembler::zero, L_done);
6855
6856 movl(result, len);
6857
6858 // Setup pointers
6859 lea(src, Address(src, len, Address::times_2)); // char[]
6860 lea(dst, Address(dst, len, Address::times_1)); // byte[]
6861 negptr(len);
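// src and dst now point just past the end of the data; len is the negated
// element count, so indexing with it walks forward as len increases toward zero.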
6862
6863 if (UseSSE42Intrinsics || UseAVX >= 2) {
6864 Label L_copy_8_chars, L_copy_8_chars_exit;
6865 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
6866
6867 if (UseAVX >= 2) {
6868 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
6869 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
6870 movdl(tmp1Reg, tmp5);
6871 vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
6872 jmp(L_chars_32_check);
6873
6874 bind(L_copy_32_chars);
6875 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
6876 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
6877 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
6878 vptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector
6879 jccb(Assembler::notZero, L_copy_32_chars_exit);
6880 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
6881 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
6882 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
6883
6884 bind(L_chars_32_check);
6885 addptr(len, 32);
6886 jcc(Assembler::lessEqual, L_copy_32_chars);
6887
6888 bind(L_copy_32_chars_exit);
6889 subptr(len, 16);
6890 jccb(Assembler::greater, L_copy_16_chars_exit);
6891
6892 } else if (UseSSE42Intrinsics) {
6893 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
6894 movdl(tmp1Reg, tmp5);
6895 pshufd(tmp1Reg, tmp1Reg, 0);
6896 jmpb(L_chars_16_check);
6897 }
6898
6899 bind(L_copy_16_chars);
6900 if (UseAVX >= 2) {
6901 vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
6902 vptest(tmp2Reg, tmp1Reg);
6903 jcc(Assembler::notZero, L_copy_16_chars_exit);
6904 vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
6905 vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
6906 } else {
6907 if (UseAVX > 0) {
6908 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
6909 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
6910 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
6911 } else {
6912 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
6913 por(tmp2Reg, tmp3Reg);
6914 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
6915 por(tmp2Reg, tmp4Reg);
6916 }
6917 ptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector
6918 jccb(Assembler::notZero, L_copy_16_chars_exit);
6919 packuswb(tmp3Reg, tmp4Reg);
6920 }
6921 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
6922
6923 bind(L_chars_16_check);
6924 addptr(len, 16);
6925 jcc(Assembler::lessEqual, L_copy_16_chars);
6926
6927 bind(L_copy_16_chars_exit);
6928 if (UseAVX >= 2) {
6929 // clean upper bits of YMM registers
6930 vpxor(tmp2Reg, tmp2Reg);
6931 vpxor(tmp3Reg, tmp3Reg);
6932 vpxor(tmp4Reg, tmp4Reg);
6933 movdl(tmp1Reg, tmp5);
6934 pshufd(tmp1Reg, tmp1Reg, 0);
6935 }
6936 subptr(len, 8);
6937 jccb(Assembler::greater, L_copy_8_chars_exit);
6938
6939 bind(L_copy_8_chars);
6940 movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
6941 ptest(tmp3Reg, tmp1Reg);
6942 jccb(Assembler::notZero, L_copy_8_chars_exit);
6943 packuswb(tmp3Reg, tmp1Reg);
6944 movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
6945 addptr(len, 8);
6946 jccb(Assembler::lessEqual, L_copy_8_chars);
6947
6948 bind(L_copy_8_chars_exit);
6949 subptr(len, 8);
6950 jccb(Assembler::zero, L_done);
6951 }
6952
6953 bind(L_copy_1_char);
6954 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
6955 testl(tmp5, short_mask); // check if Unicode or non-ASCII char
6956 jccb(Assembler::notZero, L_copy_1_char_exit);
6957 movb(Address(dst, len, Address::times_1, 0), tmp5);
6958 addptr(len, 1);
6959 jccb(Assembler::less, L_copy_1_char);
6960
6961 bind(L_copy_1_char_exit);
6962 addptr(result, len); // len is the negative count of unprocessed elements
6963
6964 bind(L_done);
6965 }
6966
6967 /**
6968 * Helper for multiply_to_len(): dest_hi:dest_lo += src1 + src2 (128-bit accumulate).
6969 */
6970 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
6971 addq(dest_lo, src1);
6972 adcq(dest_hi, 0);
6973 addq(dest_lo, src2);
6974 adcq(dest_hi, 0);
6975 }
6976
6977 /**
6978 * Multiply 64 bit by 64 bit first loop.
6979 */
6980 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
6981 Register y, Register y_idx, Register z,
6982 Register carry, Register product,
6983 Register idx, Register kdx) {
6984 //
6985 // jlong carry, x[], y[], z[];
6986 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
6987 // huge_128 product = y[idx] * x[xstart] + carry;
6988 // z[kdx] = (jlong)product;
6989 // carry = (jlong)(product >>> 64);
6990 // }
6991 // z[xstart] = carry;
6992 //
6993
6994 Label L_first_loop, L_first_loop_exit;
6995 Label L_one_x, L_one_y, L_multiply;
6996
6997 decrementl(xstart);
6998 jcc(Assembler::negative, L_one_x);
6999
7000 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
7001 rorq(x_xstart, 32); // convert big-endian to little-endian
7002
7003 bind(L_first_loop);
7004 decrementl(idx);
7005 jcc(Assembler::negative, L_first_loop_exit);
7006 decrementl(idx);
7007 jcc(Assembler::negative, L_one_y);
7008 movq(y_idx, Address(y, idx, Address::times_4, 0));
7009 rorq(y_idx, 32); // convert big-endian to little-endian
7010 bind(L_multiply);
7011 movq(product, x_xstart);
7012 mulq(y_idx); // product(rax) * y_idx -> rdx:rax
7013 addq(product, carry);
7014 adcq(rdx, 0);
7015 subl(kdx, 2);
7016 movl(Address(z, kdx, Address::times_4, 4), product);
7017 shrq(product, 32);
7018 movl(Address(z, kdx, Address::times_4, 0), product);
7019 movq(carry, rdx);
7020 jmp(L_first_loop);
7021
7022 bind(L_one_y);
7023 movl(y_idx, Address(y, 0));
7024 jmp(L_multiply);
7025
7026 bind(L_one_x);
7027 movl(x_xstart, Address(x, 0));
7028 jmp(L_first_loop);
7029
7030 bind(L_first_loop_exit);
7031 }
7032
7033 /**
7034 * Multiply 64 bit by 64 bit and add 128 bit.
7035 */
7036 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
7037 Register yz_idx, Register idx,
7038 Register carry, Register product, int offset) {
7039 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
7040 // z[kdx] = (jlong)product;
7041
7042 movq(yz_idx, Address(y, idx, Address::times_4, offset));
7043 rorq(yz_idx, 32); // convert big-endian to little-endian
7044 movq(product, x_xstart);
7045 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7046 movq(yz_idx, Address(z, idx, Address::times_4, offset));
7047 rorq(yz_idx, 32); // convert big-endian to little-endian
7048
7049 add2_with_carry(rdx, product, carry, yz_idx);
7050
7051 movl(Address(z, idx, Address::times_4, offset+4), product);
7052 shrq(product, 32);
7053 movl(Address(z, idx, Address::times_4, offset), product);
7054
7055 }
7056
7057 /**
7058 * Multiply 128 bit by 128 bit. Unrolled inner loop.
7059 */
7060 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
7061 Register yz_idx, Register idx, Register jdx,
7062 Register carry, Register product,
7063 Register carry2) {
7064 // jlong carry, x[], y[], z[];
7065 // int kdx = ystart+1;
7066 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7067 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
7068 // z[kdx+idx+1] = (jlong)product;
7069 // jlong carry2 = (jlong)(product >>> 64);
7070 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
7071 // z[kdx+idx] = (jlong)product;
7072 // carry = (jlong)(product >>> 64);
7073 // }
7074 // idx += 2;
7075 // if (idx > 0) {
7076 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
7077 // z[kdx+idx] = (jlong)product;
7078 // carry = (jlong)(product >>> 64);
7079 // }
7080 //
7081
7082 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7083
7084 movl(jdx, idx);
7085 andl(jdx, 0xFFFFFFFC);
7086 shrl(jdx, 2);
7087
7088 bind(L_third_loop);
7089 subl(jdx, 1);
7090 jcc(Assembler::negative, L_third_loop_exit);
7091 subl(idx, 4);
7092
7093 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
7094 movq(carry2, rdx);
7095
7096 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
7097 movq(carry, rdx);
7098 jmp(L_third_loop);
7099
7100 bind (L_third_loop_exit);
7101
7102 andl (idx, 0x3);
7103 jcc(Assembler::zero, L_post_third_loop_done);
7104
7105 Label L_check_1;
7106 subl(idx, 2);
7107 jcc(Assembler::negative, L_check_1);
7108
7109 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
7110 movq(carry, rdx);
7111
7112 bind (L_check_1);
7113 addl (idx, 0x2);
7114 andl (idx, 0x1);
7115 subl(idx, 1);
7116 jcc(Assembler::negative, L_post_third_loop_done);
7117
7118 movl(yz_idx, Address(y, idx, Address::times_4, 0));
7119 movq(product, x_xstart);
7120 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7121 movl(yz_idx, Address(z, idx, Address::times_4, 0));
7122
7123 add2_with_carry(rdx, product, yz_idx, carry);
7124
7125 movl(Address(z, idx, Address::times_4, 0), product);
7126 shrq(product, 32);
7127
7128 shlq(rdx, 32);
7129 orq(product, rdx);
7130 movq(carry, product);
7131
7132 bind(L_post_third_loop_done);
7133 }
7134
7135 /**
7136 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
7137 *
7138 */
7139 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
7140 Register carry, Register carry2,
7141 Register idx, Register jdx,
7142 Register yz_idx1, Register yz_idx2,
7143 Register tmp, Register tmp3, Register tmp4) {
7144 assert(UseBMI2Instructions, "should be used only when BMI2 is available");
7145
7146 // jlong carry, x[], y[], z[];
7147 // int kdx = ystart+1;
7148 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7149 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
7150 // jlong carry2 = (jlong)(tmp3 >>> 64);
7151 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
7152 // carry = (jlong)(tmp4 >>> 64);
7153 // z[kdx+idx+1] = (jlong)tmp3;
7154 // z[kdx+idx] = (jlong)tmp4;
7155 // }
7156 // idx += 2;
7157 // if (idx > 0) {
7158 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
7159 // z[kdx+idx] = (jlong)yz_idx1;
7160 // carry = (jlong)(yz_idx1 >>> 64);
7161 // }
7162 //
7163
7164 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7165
7166 movl(jdx, idx);
7167 andl(jdx, 0xFFFFFFFC);
7168 shrl(jdx, 2);
7169
7170 bind(L_third_loop);
7171 subl(jdx, 1);
7172 jcc(Assembler::negative, L_third_loop_exit);
7173 subl(idx, 4);
7174
7175 movq(yz_idx1, Address(y, idx, Address::times_4, 8));
7176 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
7177 movq(yz_idx2, Address(y, idx, Address::times_4, 0));
7178 rorxq(yz_idx2, yz_idx2, 32);
7179
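// mulxq uses rdx as its implicit multiplicand (x[xstart], per the pseudocode
// above) and, unlike mulq, leaves the flags untouched.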
7180 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
7181 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp
7182
7183 movq(yz_idx1, Address(z, idx, Address::times_4, 8));
7184 rorxq(yz_idx1, yz_idx1, 32);
7185 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
7186 rorxq(yz_idx2, yz_idx2, 32);
7187
7188 if (VM_Version::supports_adx()) {
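// ADX provides two independent carry chains (CF for adcx, OF for adox), so the
// two partial products can be accumulated without serializing on a single flag.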
7189 adcxq(tmp3, carry);
7190 adoxq(tmp3, yz_idx1);
7191
7192 adcxq(tmp4, tmp);
7193 adoxq(tmp4, yz_idx2);
7194
7195 movl(carry, 0); // does not affect flags
7196 adcxq(carry2, carry);
7197 adoxq(carry2, carry);
7198 } else {
7199 add2_with_carry(tmp4, tmp3, carry, yz_idx1);
7200 add2_with_carry(carry2, tmp4, tmp, yz_idx2);
7201 }
7202 movq(carry, carry2);
7203
7204 movl(Address(z, idx, Address::times_4, 12), tmp3);
7205 shrq(tmp3, 32);
7206 movl(Address(z, idx, Address::times_4, 8), tmp3);
7207
7208 movl(Address(z, idx, Address::times_4, 4), tmp4);
7209 shrq(tmp4, 32);
7210 movl(Address(z, idx, Address::times_4, 0), tmp4);
7211
7212 jmp(L_third_loop);
7213
7214 bind (L_third_loop_exit);
7215
7216 andl (idx, 0x3);
7217 jcc(Assembler::zero, L_post_third_loop_done);
7218
7219 Label L_check_1;
7220 subl(idx, 2);
7221 jcc(Assembler::negative, L_check_1);
7222
7223 movq(yz_idx1, Address(y, idx, Address::times_4, 0));
7224 rorxq(yz_idx1, yz_idx1, 32);
7225 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
7226 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
7227 rorxq(yz_idx2, yz_idx2, 32);
7228
7229 add2_with_carry(tmp4, tmp3, carry, yz_idx2);
7230
7231 movl(Address(z, idx, Address::times_4, 4), tmp3);
7232 shrq(tmp3, 32);
7233 movl(Address(z, idx, Address::times_4, 0), tmp3);
7234 movq(carry, tmp4);
7235
7236 bind (L_check_1);
7237 addl (idx, 0x2);
7238 andl (idx, 0x1);
7239 subl(idx, 1);
7240 jcc(Assembler::negative, L_post_third_loop_done);
7241 movl(tmp4, Address(y, idx, Address::times_4, 0));
7242 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
7243 movl(tmp4, Address(z, idx, Address::times_4, 0));
7244
7245 add2_with_carry(carry2, tmp3, tmp4, carry);
7246
7247 movl(Address(z, idx, Address::times_4, 0), tmp3);
7248 shrq(tmp3, 32);
7249
7250 shlq(carry2, 32);
7251 orq(tmp3, carry2);
7252 movq(carry, tmp3);
7253
7254 bind(L_post_third_loop_done);
7255 }
7256
7257 /**
7258 * Code for BigInteger::multiplyToLen() intrinsic.
7259 *
7260 * rdi: x
7261 * rax: xlen
7262 * rsi: y
7263 * rcx: ylen
7264 * r8: z
7265 * r11: tmp0
7266 * r12: tmp1
7267 * r13: tmp2
7268 * r14: tmp3
7269 * r15: tmp4
7270 * rbx: tmp5
7271 *
7272 */
7273 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register tmp0,
7274 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
7275 ShortBranchVerifier sbv(this);
7276 assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
7277
7278 push(tmp0);
7279 push(tmp1);
7280 push(tmp2);
7281 push(tmp3);
7282 push(tmp4);
7283 push(tmp5);
7284
7285 push(xlen);
7286
7287 const Register idx = tmp1;
7288 const Register kdx = tmp2;
7289 const Register xstart = tmp3;
7290
7291 const Register y_idx = tmp4;
7292 const Register carry = tmp5;
7293 const Register product = xlen;
7294 const Register x_xstart = tmp0;
7295
7296 // First Loop.
7297 //
7298 // final static long LONG_MASK = 0xffffffffL;
7299 // int xstart = xlen - 1;
7300 // int ystart = ylen - 1;
7301 // long carry = 0;
7302 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7303 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
7304 // z[kdx] = (int)product;
7305 // carry = product >>> 32;
7306 // }
7307 // z[xstart] = (int)carry;
7308 //
7309
7310 movl(idx, ylen); // idx = ylen;
7311 lea(kdx, Address(xlen, ylen)); // kdx = xlen+ylen;
7312 xorq(carry, carry); // carry = 0;
7313
7314 Label L_done;
7315
7316 movl(xstart, xlen);
7317 decrementl(xstart);
7318 jcc(Assembler::negative, L_done);
7319
7320 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
7321
7322 Label L_second_loop;
7323 testl(kdx, kdx);
7324 jcc(Assembler::zero, L_second_loop);
7325
7326 Label L_carry;
7327 subl(kdx, 1);
7328 jcc(Assembler::zero, L_carry);
7329
7330 movl(Address(z, kdx, Address::times_4, 0), carry);
7331 shrq(carry, 32);
7332 subl(kdx, 1);
7333
7334 bind(L_carry);
7335 movl(Address(z, kdx, Address::times_4, 0), carry);
7336
7337 // Second and third (nested) loops.
7338 //
7339 // for (int i = xstart-1; i >= 0; i--) { // Second loop
7340 // carry = 0;
7341 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
7342 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
7343 // (z[k] & LONG_MASK) + carry;
7344 // z[k] = (int)product;
7345 // carry = product >>> 32;
7346 // }
7347 // z[i] = (int)carry;
7348 // }
7349 //
7350 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
7351
7352 const Register jdx = tmp1;
7353
7354 bind(L_second_loop);
7355 xorl(carry, carry); // carry = 0;
7356 movl(jdx, ylen); // j = ystart+1
7357
7358 subl(xstart, 1); // i = xstart-1;
7359 jcc(Assembler::negative, L_done);
7360
7361 push (z);
7362
7363 Label L_last_x;
7364 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
7365 subl(xstart, 1); // i = xstart-1;
7366 jcc(Assembler::negative, L_last_x);
7367
7368 if (UseBMI2Instructions) {
7369 movq(rdx, Address(x, xstart, Address::times_4, 0));
7370 rorxq(rdx, rdx, 32); // convert big-endian to little-endian
7371 } else {
7372 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
7373 rorq(x_xstart, 32); // convert big-endian to little-endian
7374 }
7375
7376 Label L_third_loop_prologue;
7377 bind(L_third_loop_prologue);
7378
7379 push (x);
7380 push (xstart);
7381 push (ylen);
7382
7383
7384 if (UseBMI2Instructions) {
7385 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
7386 } else { // !UseBMI2Instructions
7387 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
7388 }
7389
7390 pop(ylen);
7391 pop(xlen);
7392 pop(x);
7393 pop(z);
7394
7395 movl(tmp3, xlen);
7396 addl(tmp3, 1);
7397 movl(Address(z, tmp3, Address::times_4, 0), carry);
7398 subl(tmp3, 1);
7399 jccb(Assembler::negative, L_done);
7400
7401 shrq(carry, 32);
7402 movl(Address(z, tmp3, Address::times_4, 0), carry);
7403 jmp(L_second_loop);
7404
7405 // The following infrequent code is moved outside the loops.
7406 bind(L_last_x);
7407 if (UseBMI2Instructions) {
7408 movl(rdx, Address(x, 0));
7409 } else {
7410 movl(x_xstart, Address(x, 0));
7411 }
7412 jmp(L_third_loop_prologue);
7413
7414 bind(L_done);
7415
7416 pop(xlen);
7417
7418 pop(tmp5);
7419 pop(tmp4);
7420 pop(tmp3);
7421 pop(tmp2);
7422 pop(tmp1);
7423 pop(tmp0);
7424 }
7425
7426 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
7427 Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
7428 assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
7429 Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
7430 Label VECTOR8_TAIL, VECTOR4_TAIL;
7431 Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
7432 Label SAME_TILL_END, DONE;
7433 Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
7434
7435 //scale is in rcx in both Win64 and Unix
7436 ShortBranchVerifier sbv(this);
7437
7438 shlq(length);
7439 xorq(result, result);
7440
7441 if ((AVX3Threshold == 0) && (UseAVX > 2) &&
7442 VM_Version::supports_avx512vlbw()) {
7443 Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
7444
7445 cmpq(length, 64);
7446 jcc(Assembler::less, VECTOR32_TAIL);
7447
7448 movq(tmp1, length);
7449 andq(tmp1, 0x3F); // tail count
7450 andq(length, ~(0x3F)); //vector count
7451
7452 bind(VECTOR64_LOOP);
7453 // AVX512 code to compare 64 byte vectors.
7454 evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
7455 evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
7456 kortestql(k7, k7);
7457 jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch
7458 addq(result, 64);
7459 subq(length, 64);
7460 jccb(Assembler::notZero, VECTOR64_LOOP);
7461
7463 testq(tmp1, tmp1);
7464 jcc(Assembler::zero, SAME_TILL_END);
7465
7466 //bind(VECTOR64_TAIL);
7467 // AVX512 code to compare up to 63 byte vectors.
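// Build a mask with the low tmp1 bits set ((1 << tmp1) - 1) so only the tail
// bytes participate in the masked load and compare.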
7468 mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
7469 shlxq(tmp2, tmp2, tmp1);
7470 notq(tmp2);
7471 kmovql(k3, tmp2);
7472
7473 evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
7474 evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
7475
7476 ktestql(k7, k3);
7477 jcc(Assembler::below, SAME_TILL_END); // not mismatch
7478
7479 bind(VECTOR64_NOT_EQUAL);
7480 kmovql(tmp1, k7);
7481 notq(tmp1);
7482 tzcntq(tmp1, tmp1);
7483 addq(result, tmp1);
7484 shrq(result);
7485 jmp(DONE);
7486 bind(VECTOR32_TAIL);
7487 }
7488
7489 cmpq(length, 8);
7490 jcc(Assembler::equal, VECTOR8_LOOP);
7491 jcc(Assembler::less, VECTOR4_TAIL);
7492
7493 if (UseAVX >= 2) {
7494 Label VECTOR16_TAIL, VECTOR32_LOOP;
7495
7496 cmpq(length, 16);
7497 jcc(Assembler::equal, VECTOR16_LOOP);
7498 jcc(Assembler::less, VECTOR8_LOOP);
7499
7500 cmpq(length, 32);
7501 jccb(Assembler::less, VECTOR16_TAIL);
7502
7503 subq(length, 32);
7504 bind(VECTOR32_LOOP);
7505 vmovdqu(rymm0, Address(obja, result));
7506 vmovdqu(rymm1, Address(objb, result));
7507 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
7508 vptest(rymm2, rymm2);
7509 jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
7510 addq(result, 32);
7511 subq(length, 32);
7512 jcc(Assembler::greaterEqual, VECTOR32_LOOP);
7513 addq(length, 32);
7514 jcc(Assembler::equal, SAME_TILL_END);
7515 // fall through if fewer than 32 bytes are left
7516
7517 bind(VECTOR16_TAIL);
7518 cmpq(length, 16);
7519 jccb(Assembler::less, VECTOR8_TAIL);
7520 bind(VECTOR16_LOOP);
7521 movdqu(rymm0, Address(obja, result));
7522 movdqu(rymm1, Address(objb, result));
7523 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
7524 ptest(rymm2, rymm2);
7525 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
7526 addq(result, 16);
7527 subq(length, 16);
7528 jcc(Assembler::equal, SAME_TILL_END);
7529 //falling through if less than 16 bytes left
7530 } else {//regular intrinsics
7531
7532 cmpq(length, 16);
7533 jccb(Assembler::less, VECTOR8_TAIL);
7534
7535 subq(length, 16);
7536 bind(VECTOR16_LOOP);
7537 movdqu(rymm0, Address(obja, result));
7538 movdqu(rymm1, Address(objb, result));
7539 pxor(rymm0, rymm1);
7540 ptest(rymm0, rymm0);
7541 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
7542 addq(result, 16);
7543 subq(length, 16);
7544 jccb(Assembler::greaterEqual, VECTOR16_LOOP);
7545 addq(length, 16);
7546 jcc(Assembler::equal, SAME_TILL_END);
7547 //falling through if less than 16 bytes left
7548 }
7549
7550 bind(VECTOR8_TAIL);
7551 cmpq(length, 8);
7552 jccb(Assembler::less, VECTOR4_TAIL);
7553 bind(VECTOR8_LOOP);
7554 movq(tmp1, Address(obja, result));
7555 movq(tmp2, Address(objb, result));
7556 xorq(tmp1, tmp2);
7557 testq(tmp1, tmp1);
7558 jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
7559 addq(result, 8);
7560 subq(length, 8);
7561 jcc(Assembler::equal, SAME_TILL_END);
7562 //falling through if less than 8 bytes left
7563
7564 bind(VECTOR4_TAIL);
7565 cmpq(length, 4);
7566 jccb(Assembler::less, BYTES_TAIL);
7567 bind(VECTOR4_LOOP);
7568 movl(tmp1, Address(obja, result));
7569 xorl(tmp1, Address(objb, result));
7570 testl(tmp1, tmp1);
7571 jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
7572 addq(result, 4);
7573 subq(length, 4);
7574 jcc(Assembler::equal, SAME_TILL_END);
7575 //falling through if less than 4 bytes left
7576
7577 bind(BYTES_TAIL);
7578 bind(BYTES_LOOP);
7579 load_unsigned_byte(tmp1, Address(obja, result));
7580 load_unsigned_byte(tmp2, Address(objb, result));
7581 xorl(tmp1, tmp2);
7582 testl(tmp1, tmp1);
7583 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
7584 decq(length);
7585 jcc(Assembler::zero, SAME_TILL_END);
7586 incq(result);
7587 load_unsigned_byte(tmp1, Address(obja, result));
7588 load_unsigned_byte(tmp2, Address(objb, result));
7589 xorl(tmp1, tmp2);
7590 testl(tmp1, tmp1);
7591 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
7592 decq(length);
7593 jcc(Assembler::zero, SAME_TILL_END);
7594 incq(result);
7595 load_unsigned_byte(tmp1, Address(obja, result));
7596 load_unsigned_byte(tmp2, Address(objb, result));
7597 xorl(tmp1, tmp2);
7598 testl(tmp1, tmp1);
7599 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
7600 jmp(SAME_TILL_END);
7601
7602 if (UseAVX >= 2) {
7603 bind(VECTOR32_NOT_EQUAL);
7604 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
7605 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
7606 vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
7607 vpmovmskb(tmp1, rymm0);
7608 bsfq(tmp1, tmp1);
7609 addq(result, tmp1);
7610 shrq(result);
7611 jmp(DONE);
7612 }
7613
7614 bind(VECTOR16_NOT_EQUAL);
7615 if (UseAVX >= 2) {
7616 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
7617 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
7618 pxor(rymm0, rymm2);
7619 } else {
7620 pcmpeqb(rymm2, rymm2);
7621 pxor(rymm0, rymm1);
7622 pcmpeqb(rymm0, rymm1);
7623 pxor(rymm0, rymm2);
7624 }
7625 pmovmskb(tmp1, rymm0);
7626 bsfq(tmp1, tmp1);
7627 addq(result, tmp1);
7628 shrq(result);
7629 jmpb(DONE);
7630
7631 bind(VECTOR8_NOT_EQUAL);
7632 bind(VECTOR4_NOT_EQUAL);
7633 bsfq(tmp1, tmp1);
7634 shrq(tmp1, 3);
7635 addq(result, tmp1);
7636 bind(BYTES_NOT_EQUAL);
7637 shrq(result);
7638 jmpb(DONE);
7639
7640 bind(SAME_TILL_END);
7641 mov64(result, -1);
7642
7643 bind(DONE);
7644 }
7645
7646 //Helper functions for square_to_len()
7647
7648 /**
7649 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
7650 * Preserves x and z and modifies rest of the registers.
7651 */
7652 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7653 // Perform square and right shift by 1
7654 // Handle odd xlen case first, then for even xlen do the following
7655 // jlong carry = 0;
7656 // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
7657 // huge_128 product = x[j:j+1] * x[j:j+1];
7658 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
7659 // z[i+2:i+3] = (jlong)(product >>> 1);
7660 // carry = (jlong)product;
7661 // }
7662
7663 xorq(tmp5, tmp5); // carry
7664 xorq(rdxReg, rdxReg);
7665 xorl(tmp1, tmp1); // index for x
7666 xorl(tmp4, tmp4); // index for z
7667
7668 Label L_first_loop, L_first_loop_exit;
7669
7670 testl(xlen, 1);
7671 jccb(Assembler::zero, L_first_loop); //jump if xlen is even
7672
7673 // Square and right shift by 1 the odd element using 32 bit multiply
7674 movl(raxReg, Address(x, tmp1, Address::times_4, 0));
7675 imulq(raxReg, raxReg);
7676 shrq(raxReg, 1);
7677 adcq(tmp5, 0);
7678 movq(Address(z, tmp4, Address::times_4, 0), raxReg);
7679 incrementl(tmp1);
7680 addl(tmp4, 2);
7681
7682 // Square and right shift by 1 the rest using 64 bit multiply
7683 bind(L_first_loop);
7684 cmpptr(tmp1, xlen);
7685 jccb(Assembler::equal, L_first_loop_exit);
7686
7687 // Square
7688 movq(raxReg, Address(x, tmp1, Address::times_4, 0));
7689 rorq(raxReg, 32); // convert big-endian to little-endian
7690 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax
7691
7692 // Right shift by 1 and save carry
7693 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
7694 rcrq(rdxReg, 1);
7695 rcrq(raxReg, 1);
7696 adcq(tmp5, 0);
7697
7698 // Store result in z
7699 movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
7700 movq(Address(z, tmp4, Address::times_4, 8), raxReg);
7701
7702 // Update indices for x and z
7703 addl(tmp1, 2);
7704 addl(tmp4, 4);
7705 jmp(L_first_loop);
7706
7707 bind(L_first_loop_exit);
7708 }
7709
7710
7711 /**
7712 * Perform the following multiply add operation using BMI2 instructions
7713 * carry:sum = sum + op1*op2 + carry
7714 * op2 should be in rdx
7715 * op2 is preserved, all other registers are modified
7716 */
7717 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
7718 // op2 must be in rdx: it is the implicit multiplicand of mulxq
7719 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1
7720 addq(sum, carry);
7721 adcq(tmp2, 0);
7722 addq(sum, op1);
7723 adcq(tmp2, 0);
7724 movq(carry, tmp2);
7725 }
7726
7727 /**
7728 * Perform the following multiply add operation:
7729 * carry:sum = sum + op1*op2 + carry
7730 * Preserves op1, op2 and modifies rest of registers
7731 */
7732 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
7733 // rdx:rax = op1 * op2
7734 movq(raxReg, op2);
7735 mulq(op1);
7736
7737 // rdx:rax = sum + carry + rdx:rax
7738 addq(sum, carry);
7739 adcq(rdxReg, 0);
7740 addq(sum, raxReg);
7741 adcq(rdxReg, 0);
7742
7743 // carry:sum = rdx:sum
7744 movq(carry, rdxReg);
7745 }
7746
7747 /**
7748 * Add 64 bit long carry into z[] with carry propagation.
7749 * Preserves z and carry register values and modifies rest of registers.
7750 *
7751 */
7752 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
7753 Label L_fourth_loop, L_fourth_loop_exit;
7754
7755 movl(tmp1, 1);
7756 subl(zlen, 2);
7757 addq(Address(z, zlen, Address::times_4, 0), carry);
7758
7759 bind(L_fourth_loop);
7760 jccb(Assembler::carryClear, L_fourth_loop_exit);
7761 subl(zlen, 2);
7762 jccb(Assembler::negative, L_fourth_loop_exit);
7763 addq(Address(z, zlen, Address::times_4, 0), tmp1);
7764 jmp(L_fourth_loop);
7765 bind(L_fourth_loop_exit);
7766 }
7767
7768 /**
7769 * Shift z[] left by 1 bit.
7770 * Preserves x, len, z and zlen registers and modifies rest of the registers.
7771 *
7772 */
7773 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
7774
7775 Label L_fifth_loop, L_fifth_loop_exit;
7776
7777 // Fifth loop
7778 // Perform primitiveLeftShift(z, zlen, 1)
7779
7780 const Register prev_carry = tmp1;
7781 const Register new_carry = tmp4;
7782 const Register value = tmp2;
7783 const Register zidx = tmp3;
7784
7785 // int zidx, carry;
7786 // long value;
7787 // carry = 0;
7788 // for (zidx = zlen-2; zidx >= 0; zidx -= 2) {
7789 //   (carry:value) = (z[zidx] << 1) | carry;
7790 //   z[zidx] = value;
7791 // }
7792
7793 movl(zidx, zlen);
7794 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
7795
7796 bind(L_fifth_loop);
7797 decl(zidx); // Use decl to preserve carry flag
7798 decl(zidx);
7799 jccb(Assembler::negative, L_fifth_loop_exit);
7800
7801 if (UseBMI2Instructions) {
7802 movq(value, Address(z, zidx, Address::times_4, 0));
7803 rclq(value, 1);
7804 rorxq(value, value, 32);
7805 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
7806 }
7807 else {
7808 // clear new_carry
7809 xorl(new_carry, new_carry);
7810
7811 // Shift z[i] by 1, or in previous carry and save new carry
7812 movq(value, Address(z, zidx, Address::times_4, 0));
7813 shlq(value, 1);
7814 adcl(new_carry, 0);
7815
7816 orq(value, prev_carry);
7817 rorq(value, 0x20);
7818 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
7819
7820 // Set previous carry = new carry
7821 movl(prev_carry, new_carry);
7822 }
7823 jmp(L_fifth_loop);
7824
7825 bind(L_fifth_loop_exit);
7826 }
7827
7828
7829 /**
7830 * Code for BigInteger::squareToLen() intrinsic
7831 *
7832 * rdi: x
7833 * rsi: len
7834 * r8: z
7835 * rcx: zlen
7836 * r12: tmp1
7837 * r13: tmp2
7838 * r14: tmp3
7839 * r15: tmp4
7840 * rbx: tmp5
7841 *
7842 */
7843 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7844
7845 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
7846 push(tmp1);
7847 push(tmp2);
7848 push(tmp3);
7849 push(tmp4);
7850 push(tmp5);
7851
7852 // First loop
7853 // Store the squares, right shifted one bit (i.e., divided by 2).
7854 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
7855
7856 // Add in off-diagonal sums.
7857 //
7858 // Second, third (nested) and fourth loops.
7859 // zlen +=2;
7860 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
7861 // carry = 0;
7862 // long op2 = x[xidx:xidx+1];
7863 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
7864 // k -= 2;
7865 // long op1 = x[j:j+1];
7866 // long sum = z[k:k+1];
7867 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
7868 // z[k:k+1] = sum;
7869 // }
7870 // add_one_64(z, k, carry, tmp_regs);
7871 // }
7872
7873 const Register carry = tmp5;
7874 const Register sum = tmp3;
7875 const Register op1 = tmp4;
7876 Register op2 = tmp2;
7877
7878 push(zlen);
7879 push(len);
7880 addl(zlen,2);
7881 bind(L_second_loop);
7882 xorq(carry, carry);
7883 subl(zlen, 4);
7884 subl(len, 2);
7885 push(zlen);
7886 push(len);
7887 cmpl(len, 0);
7888 jccb(Assembler::lessEqual, L_second_loop_exit);
7889
7890 // Multiply an array by one 64 bit long.
7891 if (UseBMI2Instructions) {
7892 op2 = rdxReg;
7893 movq(op2, Address(x, len, Address::times_4, 0));
7894 rorxq(op2, op2, 32);
7895 }
7896 else {
7897 movq(op2, Address(x, len, Address::times_4, 0));
7898 rorq(op2, 32);
7899 }
7900
7901 bind(L_third_loop);
7902 decrementl(len);
7903 jccb(Assembler::negative, L_third_loop_exit);
7904 decrementl(len);
7905 jccb(Assembler::negative, L_last_x);
7906
7907 movq(op1, Address(x, len, Address::times_4, 0));
7908 rorq(op1, 32);
7909
7910 bind(L_multiply);
7911 subl(zlen, 2);
7912 movq(sum, Address(z, zlen, Address::times_4, 0));
7913
7914 // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
7915 if (UseBMI2Instructions) {
7916 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
7917 }
7918 else {
7919 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7920 }
7921
7922 movq(Address(z, zlen, Address::times_4, 0), sum);
7923
7924 jmp(L_third_loop);
7925 bind(L_third_loop_exit);
7926
7927 // Fourth loop
7928 // Add 64 bit long carry into z with carry propagation.
7929 // Uses offsetted zlen.
7930 add_one_64(z, zlen, carry, tmp1);
7931
7932 pop(len);
7933 pop(zlen);
7934 jmp(L_second_loop);
7935
7936 // The following infrequent code is moved outside the loops.
7937 bind(L_last_x);
7938 movl(op1, Address(x, 0));
7939 jmp(L_multiply);
7940
7941 bind(L_second_loop_exit);
7942 pop(len);
7943 pop(zlen);
7944 pop(len);
7945 pop(zlen);
7946
7947 // Fifth loop
7948 // Shift z left 1 bit.
7949 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
7950
7951 // z[zlen-1] |= x[len-1] & 1;
7952 movl(tmp3, Address(x, len, Address::times_4, -4));
7953 andl(tmp3, 1);
7954 orl(Address(z, zlen, Address::times_4, -4), tmp3);
7955
7956 pop(tmp5);
7957 pop(tmp4);
7958 pop(tmp3);
7959 pop(tmp2);
7960 pop(tmp1);
7961 }
7962
7963 /**
7964 * Helper function for mul_add()
7965 * Multiply the in[] by int k and add to out[] starting at offset offs using
7966 * 128 bit by 32 bit multiply and return the carry in tmp5.
7967 * Only a length of in[] that is a multiple of four ints is processed by this function.
7968 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
7969 * This function preserves out, in and k registers.
7970 * len and offset point to the appropriate indices into in[] and out[], respectively.
7971 * tmp5 has the carry.
7972 * other registers are temporary and are modified.
7973 *
7974 */
7975 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
7976 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
7977 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7978
7979 Label L_first_loop, L_first_loop_exit;
7980
7981 movl(tmp1, len);
7982 shrl(tmp1, 2);
7983
7984 bind(L_first_loop);
7985 subl(tmp1, 1);
7986 jccb(Assembler::negative, L_first_loop_exit);
7987
7988 subl(len, 4);
7989 subl(offset, 4);
7990
7991 Register op2 = tmp2;
7992 const Register sum = tmp3;
7993 const Register op1 = tmp4;
7994 const Register carry = tmp5;
7995
7996 if (UseBMI2Instructions) {
7997 op2 = rdxReg;
7998 }
7999
8000 movq(op1, Address(in, len, Address::times_4, 8));
8001 rorq(op1, 32);
8002 movq(sum, Address(out, offset, Address::times_4, 8));
8003 rorq(sum, 32);
8004 if (UseBMI2Instructions) {
8005 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8006 }
8007 else {
8008 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8009 }
8010 // Store back in big endian from little endian
8011 rorq(sum, 0x20);
8012 movq(Address(out, offset, Address::times_4, 8), sum);
8013
8014 movq(op1, Address(in, len, Address::times_4, 0));
8015 rorq(op1, 32);
8016 movq(sum, Address(out, offset, Address::times_4, 0));
8017 rorq(sum, 32);
8018 if (UseBMI2Instructions) {
8019 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8020 }
8021 else {
8022 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8023 }
8024 // Store back in big endian from little endian
8025 rorq(sum, 0x20);
8026 movq(Address(out, offset, Address::times_4, 0), sum);
8027
8028 jmp(L_first_loop);
8029 bind(L_first_loop_exit);
8030 }
8031
8032 /**
8033 * Code for BigInteger::mulAdd() intrinsic
8034 *
8035 * rdi: out
8036 * rsi: in
8037 * r11: offs (out.length - offset)
8038 * rcx: len
8039 * r8: k
8040 * r12: tmp1
8041 * r13: tmp2
8042 * r14: tmp3
8043 * r15: tmp4
8044 * rbx: tmp5
8045 * Multiply the in[] by word k and add to out[], return the carry in rax
8046 */
8047 void MacroAssembler::mul_add(Register out, Register in, Register offs,
8048 Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
8049 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8050
8051 Label L_carry, L_last_in, L_done;
8052
8053 // carry = 0;
8054 // for (int j=len-1; j >= 0; j--) {
8055 // long product = (in[j] & LONG_MASK) * kLong +
8056 // (out[offs] & LONG_MASK) + carry;
8057 // out[offs--] = (int)product;
8058 // carry = product >>> 32;
8059 // }
8060 //
8061 push(tmp1);
8062 push(tmp2);
8063 push(tmp3);
8064 push(tmp4);
8065 push(tmp5);
8066
8067 Register op2 = tmp2;
8068 const Register sum = tmp3;
8069 const Register op1 = tmp4;
8070 const Register carry = tmp5;
8071
8072 if (UseBMI2Instructions) {
8073 op2 = rdxReg;
8074 movl(op2, k);
8075 }
8076 else {
8077 movl(op2, k);
8078 }
8079
8080 xorq(carry, carry);
8081
8082 //First loop
8083
8084 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
8085 //The carry is in tmp5
8086 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
8087
8088 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
8089 decrementl(len);
8090 jccb(Assembler::negative, L_carry);
8091 decrementl(len);
8092 jccb(Assembler::negative, L_last_in);
8093
8094 movq(op1, Address(in, len, Address::times_4, 0));
8095 rorq(op1, 32);
8096
8097 subl(offs, 2);
8098 movq(sum, Address(out, offs, Address::times_4, 0));
8099 rorq(sum, 32);
8100
8101 if (UseBMI2Instructions) {
8102 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8103 }
8104 else {
8105 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8106 }
8107
8108 // Store back in big endian from little endian
8109 rorq(sum, 0x20);
8110 movq(Address(out, offs, Address::times_4, 0), sum);
8111
8112 testl(len, len);
8113 jccb(Assembler::zero, L_carry);
8114
8115 //Multiply the last in[] entry, if any
8116 bind(L_last_in);
8117 movl(op1, Address(in, 0));
8118 movl(sum, Address(out, offs, Address::times_4, -4));
8119
8120 movl(raxReg, k);
8121 mull(op1); //tmp4 * eax -> edx:eax
8122 addl(sum, carry);
8123 adcl(rdxReg, 0);
8124 addl(sum, raxReg);
8125 adcl(rdxReg, 0);
8126 movl(carry, rdxReg);
8127
8128 movl(Address(out, offs, Address::times_4, -4), sum);
8129
8130 bind(L_carry);
8131 //return tmp5/carry as carry in rax
8132 movl(rax, carry);
8133
8134 bind(L_done);
8135 pop(tmp5);
8136 pop(tmp4);
8137 pop(tmp3);
8138 pop(tmp2);
8139 pop(tmp1);
8140 }
8141
8142 /**
8143 * Emits code to update CRC-32 with a byte value according to constants in table
8144 *
 * @param [in,out] crc   Register containing the crc.
 * @param [in]     val   Register containing the byte to fold into the CRC.
 * @param [in]     table Register containing the table of crc constants.
8148 *
8149 * uint32_t crc;
8150 * val = crc_table[(val ^ crc) & 0xFF];
8151 * crc = val ^ (crc >> 8);
8152 *
8153 */
8154 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
8155 xorl(val, crc);
8156 andl(val, 0xFF);
8157 shrl(crc, 8); // unsigned shift
8158 xorl(crc, Address(table, val, Address::times_4, 0));
8159 }
8160
8161 /**
8162 * Fold 128-bit data chunk
8163 */
8164 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
8165 if (UseAVX > 0) {
    vpclmulhdq(xtmp, xK, xcrc); // [127:64]
8167 vpclmulldq(xcrc, xK, xcrc); // [63:0]
8168 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
8169 pxor(xcrc, xtmp);
8170 } else {
8171 movdqa(xtmp, xcrc);
    pclmulhdq(xtmp, xK); // [127:64]
8173 pclmulldq(xcrc, xK); // [63:0]
8174 pxor(xcrc, xtmp);
8175 movdqu(xtmp, Address(buf, offset));
8176 pxor(xcrc, xtmp);
8177 }
8178 }
8179
8180 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
8181 if (UseAVX > 0) {
8182 vpclmulhdq(xtmp, xK, xcrc);
8183 vpclmulldq(xcrc, xK, xcrc);
8184 pxor(xcrc, xbuf);
8185 pxor(xcrc, xtmp);
8186 } else {
8187 movdqa(xtmp, xcrc);
8188 pclmulhdq(xtmp, xK);
8189 pclmulldq(xcrc, xK);
8190 pxor(xcrc, xbuf);
8191 pxor(xcrc, xtmp);
8192 }
8193 }
8194
8195 /**
8196 * 8-bit folds to compute 32-bit CRC
8197 *
8198 * uint64_t xcrc;
8199 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
8200 */
8201 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
8202 movdl(tmp, xcrc);
8203 andl(tmp, 0xFF);
8204 movdl(xtmp, Address(table, tmp, Address::times_4, 0));
8205 psrldq(xcrc, 1); // unsigned shift one byte
8206 pxor(xcrc, xtmp);
8207 }
8208
8209 /**
8210 * uint32_t crc;
8211 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
8212 */
8213 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
8214 movl(tmp, crc);
8215 andl(tmp, 0xFF);
8216 shrl(crc, 8);
8217 xorl(crc, Address(table, tmp, Address::times_4, 0));
8218 }
8219
8220 /**
8221 * @param crc register containing existing CRC (32-bit)
8222 * @param buf register pointing to input byte buffer (byte*)
8223 * @param len register containing number of bytes
8224 * @param table register that will contain address of CRC table
8225 * @param tmp scratch register
8226 */
8227 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
8228 assert_different_registers(crc, buf, len, table, tmp, rax);
8229
8230 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8231 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8232
8233 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
8234 // context for the registers used, where all instructions below are using 128-bit mode
8235 // On EVEX without VL and BW, these instructions will all be AVX.
8236 lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
8237 notl(crc); // ~crc
8238 cmpl(len, 16);
8239 jcc(Assembler::less, L_tail);
8240
8241 // Align buffer to 16 bytes
8242 movl(tmp, buf);
8243 andl(tmp, 0xF);
8244 jccb(Assembler::zero, L_aligned);
8245 subl(tmp, 16);
8246 addl(len, tmp);
8247
8248 align(4);
8249 BIND(L_align_loop);
8250 movsbl(rax, Address(buf, 0)); // load byte with sign extension
8251 update_byte_crc32(crc, rax, table);
8252 increment(buf);
8253 incrementl(tmp);
8254 jccb(Assembler::less, L_align_loop);
8255
8256 BIND(L_aligned);
8257 movl(tmp, len); // save
8258 shrl(len, 4);
8259 jcc(Assembler::zero, L_tail_restore);
8260
8261 // Fold crc into first bytes of vector
8262 movdqa(xmm1, Address(buf, 0));
8263 movdl(rax, xmm1);
8264 xorl(crc, rax);
8265 if (VM_Version::supports_sse4_1()) {
8266 pinsrd(xmm1, crc, 0);
8267 } else {
8268 pinsrw(xmm1, crc, 0);
8269 shrl(crc, 16);
8270 pinsrw(xmm1, crc, 1);
8271 }
8272 addptr(buf, 16);
8273 subl(len, 4); // len > 0
8274 jcc(Assembler::less, L_fold_tail);
8275
8276 movdqa(xmm2, Address(buf, 0));
8277 movdqa(xmm3, Address(buf, 16));
8278 movdqa(xmm4, Address(buf, 32));
8279 addptr(buf, 48);
8280 subl(len, 3);
8281 jcc(Assembler::lessEqual, L_fold_512b);
8282
8283 // Fold total 512 bits of polynomial on each iteration,
8284 // 128 bits per each of 4 parallel streams.
8285 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32), rscratch1);
8286
8287 align32();
8288 BIND(L_fold_512b_loop);
8289 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
8290 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
8291 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
8292 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
8293 addptr(buf, 64);
8294 subl(len, 4);
8295 jcc(Assembler::greater, L_fold_512b_loop);
8296
8297 // Fold 512 bits to 128 bits.
8298 BIND(L_fold_512b);
8299 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
8300 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
8301 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
8302 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
8303
8304 // Fold the rest of 128 bits data chunks
8305 BIND(L_fold_tail);
8306 addl(len, 3);
8307 jccb(Assembler::lessEqual, L_fold_128b);
8308 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
8309
8310 BIND(L_fold_tail_loop);
8311 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
8312 addptr(buf, 16);
8313 decrementl(len);
8314 jccb(Assembler::greater, L_fold_tail_loop);
8315
8316 // Fold 128 bits in xmm1 down into 32 bits in crc register.
8317 BIND(L_fold_128b);
8318 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()), rscratch1);
8319 if (UseAVX > 0) {
8320 vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
8321 vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
8322 vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
8323 } else {
8324 movdqa(xmm2, xmm0);
8325 pclmulqdq(xmm2, xmm1, 0x1);
8326 movdqa(xmm3, xmm0);
8327 pand(xmm3, xmm2);
8328 pclmulqdq(xmm0, xmm3, 0x1);
8329 }
8330 psrldq(xmm1, 8);
8331 psrldq(xmm2, 4);
8332 pxor(xmm0, xmm1);
8333 pxor(xmm0, xmm2);
8334
8335 // 8 8-bit folds to compute 32-bit CRC.
8336 for (int j = 0; j < 4; j++) {
8337 fold_8bit_crc32(xmm0, table, xmm1, rax);
8338 }
8339 movdl(crc, xmm0); // mov 32 bits to general register
8340 for (int j = 0; j < 4; j++) {
8341 fold_8bit_crc32(crc, table, rax);
8342 }
8343
8344 BIND(L_tail_restore);
8345 movl(len, tmp); // restore
8346 BIND(L_tail);
8347 andl(len, 0xf);
8348 jccb(Assembler::zero, L_exit);
8349
8350 // Fold the rest of bytes
8351 align(4);
8352 BIND(L_tail_loop);
8353 movsbl(rax, Address(buf, 0)); // load byte with sign extension
8354 update_byte_crc32(crc, rax, table);
8355 increment(buf);
8356 decrementl(len);
8357 jccb(Assembler::greater, L_tail_loop);
8358
8359 BIND(L_exit);
  notl(crc); // ~crc
8361 }
8362
8363 // Helper function for AVX 512 CRC32
8364 // Fold 512-bit data chunks
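// Informally (a sketch; the 0x10/0x01 immediates select the low*high and
// high*low qword products per 128-bit lane): for each of the four 128-bit lanes,
//   xcrc' = clmul(xcrc.lo64, xK.hi64) ^ clmul(xcrc.hi64, xK.lo64) ^ M
// where M is the matching lane of the 64-byte chunk at buf + pos + offset.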
8365 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
8366 Register pos, int offset) {
8367 evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
  evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [127:64]
8369 evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
8370 evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
8371 evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
8372 }
8373
8374 // Helper function for AVX 512 CRC32
8375 // Compute CRC32 for < 256B buffers
8376 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
8377 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
8378 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
8379
8380 Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
8381 Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
8382 Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
8383
8384 // check if there is enough buffer to be able to fold 16B at a time
8385 cmpl(len, 32);
8386 jcc(Assembler::less, L_less_than_32);
8387
8388 // if there is, load the constants
8389 movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10
8390 movdl(xmm0, crc); // get the initial crc value
8391 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
8392 pxor(xmm7, xmm0);
8393
8394 // update the buffer pointer
8395 addl(pos, 16);
  // update the counter. Subtract 32 instead of 16 to save one instruction from the loop
8397 subl(len, 32);
8398 jmp(L_16B_reduction_loop);
8399
8400 bind(L_less_than_32);
  // mov the initial crc to the return value. This is necessary for zero-length buffers.
8402 movl(rax, crc);
8403 testl(len, len);
8404 jcc(Assembler::equal, L_cleanup);
8405
8406 movdl(xmm0, crc); //get the initial crc value
8407
8408 cmpl(len, 16);
8409 jcc(Assembler::equal, L_exact_16_left);
8410 jcc(Assembler::less, L_less_than_16_left);
8411
8412 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
8413 pxor(xmm7, xmm0); //xor the initial crc value
8414 addl(pos, 16);
8415 subl(len, 16);
8416 movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10
8417 jmp(L_get_last_two_xmms);
8418
8419 bind(L_less_than_16_left);
  // use stack space to load data less than 16 bytes; zero out the 16B in memory first.
8421 pxor(xmm1, xmm1);
8422 movptr(tmp1, rsp);
8423 movdqu(Address(tmp1, 0 * 16), xmm1);
8424
8425 cmpl(len, 4);
8426 jcc(Assembler::less, L_only_less_than_4);
8427
  // back up the counter value
8429 movl(tmp2, len);
8430 cmpl(len, 8);
8431 jcc(Assembler::less, L_less_than_8_left);
8432
8433 //load 8 Bytes
8434 movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
8435 movq(Address(tmp1, 0 * 16), rax);
8436 addptr(tmp1, 8);
8437 subl(len, 8);
8438 addl(pos, 8);
8439
8440 bind(L_less_than_8_left);
8441 cmpl(len, 4);
8442 jcc(Assembler::less, L_less_than_4_left);
8443
8444 //load 4 Bytes
8445 movl(rax, Address(buf, pos, Address::times_1, 0));
8446 movl(Address(tmp1, 0 * 16), rax);
8447 addptr(tmp1, 4);
8448 subl(len, 4);
8449 addl(pos, 4);
8450
8451 bind(L_less_than_4_left);
8452 cmpl(len, 2);
8453 jcc(Assembler::less, L_less_than_2_left);
8454
8455 // load 2 Bytes
8456 movw(rax, Address(buf, pos, Address::times_1, 0));
8457 movl(Address(tmp1, 0 * 16), rax);
8458 addptr(tmp1, 2);
8459 subl(len, 2);
8460 addl(pos, 2);
8461
8462 bind(L_less_than_2_left);
8463 cmpl(len, 1);
8464 jcc(Assembler::less, L_zero_left);
8465
8466 // load 1 Byte
8467 movb(rax, Address(buf, pos, Address::times_1, 0));
8468 movb(Address(tmp1, 0 * 16), rax);
8469
8470 bind(L_zero_left);
8471 movdqu(xmm7, Address(rsp, 0));
8472 pxor(xmm7, xmm0); //xor the initial crc value
8473
8474 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
8475 movdqu(xmm0, Address(rax, tmp2));
8476 pshufb(xmm7, xmm0);
8477 jmp(L_128_done);
8478
8479 bind(L_exact_16_left);
8480 movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
8481 pxor(xmm7, xmm0); //xor the initial crc value
8482 jmp(L_128_done);
8483
8484 bind(L_only_less_than_4);
8485 cmpl(len, 3);
8486 jcc(Assembler::less, L_only_less_than_3);
8487
8488 // load 3 Bytes
8489 movb(rax, Address(buf, pos, Address::times_1, 0));
8490 movb(Address(tmp1, 0), rax);
8491
8492 movb(rax, Address(buf, pos, Address::times_1, 1));
8493 movb(Address(tmp1, 1), rax);
8494
8495 movb(rax, Address(buf, pos, Address::times_1, 2));
8496 movb(Address(tmp1, 2), rax);
8497
8498 movdqu(xmm7, Address(rsp, 0));
8499 pxor(xmm7, xmm0); //xor the initial crc value
8500
8501 pslldq(xmm7, 0x5);
8502 jmp(L_barrett);
8503 bind(L_only_less_than_3);
8504 cmpl(len, 2);
8505 jcc(Assembler::less, L_only_less_than_2);
8506
8507 // load 2 Bytes
8508 movb(rax, Address(buf, pos, Address::times_1, 0));
8509 movb(Address(tmp1, 0), rax);
8510
8511 movb(rax, Address(buf, pos, Address::times_1, 1));
8512 movb(Address(tmp1, 1), rax);
8513
8514 movdqu(xmm7, Address(rsp, 0));
8515 pxor(xmm7, xmm0); //xor the initial crc value
8516
8517 pslldq(xmm7, 0x6);
8518 jmp(L_barrett);
8519
8520 bind(L_only_less_than_2);
8521 //load 1 Byte
8522 movb(rax, Address(buf, pos, Address::times_1, 0));
8523 movb(Address(tmp1, 0), rax);
8524
8525 movdqu(xmm7, Address(rsp, 0));
8526 pxor(xmm7, xmm0); //xor the initial crc value
8527
8528 pslldq(xmm7, 0x7);
8529 }
8530
8531 /**
8532 * Compute CRC32 using AVX512 instructions
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table address of crc or crc32c table
 * @param tmp1  scratch register
 * @param tmp2  scratch register
 * @return rax  result register
8540 *
8541 * This routine is identical for crc32c with the exception of the precomputed constant
8542 * table which will be passed as the table argument. The calculation steps are
8543 * the same for both variants.
8544 */
8545 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
8546 assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
8547
8548 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8549 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8550 Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
8551 Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
8552 Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
8553
8554 const Register pos = r12;
8555 push(r12);
8556 subptr(rsp, 16 * 2 + 8);
8557
8558 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
8559 // context for the registers used, where all instructions below are using 128-bit mode
8560 // On EVEX without VL and BW, these instructions will all be AVX.
8561 movl(pos, 0);
8562
8563 // check if smaller than 256B
8564 cmpl(len, 256);
8565 jcc(Assembler::less, L_less_than_256);
8566
8567 // load the initial crc value
8568 movdl(xmm10, crc);
8569
8570 // receive the initial 64B data, xor the initial crc value
8571 evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
8572 evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
8573 evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
8574 evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
8575
8576 subl(len, 256);
8577 cmpl(len, 256);
8578 jcc(Assembler::less, L_fold_128_B_loop);
8579
8580 evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
8581 evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
8582 evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
8583 subl(len, 256);
8584
8585 bind(L_fold_256_B_loop);
8586 addl(pos, 256);
8587 fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
8588 fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
8589 fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
8590 fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
8591
8592 subl(len, 256);
8593 jcc(Assembler::greaterEqual, L_fold_256_B_loop);
8594
8595 // Fold 256 into 128
8596 addl(pos, 256);
8597 evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
8598 evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
8599 vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
8600
8601 evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
8602 evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
8603 vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
8604
8605 evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
8606 evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
8607
8608 addl(len, 128);
8609 jmp(L_fold_128_B_register);
8610
  // at this point in the code there are 128 * x + y (0 <= y < 128) bytes of buffer. The fold_128_B_loop
  // will fold 128B at a time until we have 128 + y bytes of buffer

  // fold 128B at a time. This section of the code folds 2 zmm registers in parallel
8615 bind(L_fold_128_B_loop);
8616 addl(pos, 128);
8617 fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
8618 fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
8619
8620 subl(len, 128);
8621 jcc(Assembler::greaterEqual, L_fold_128_B_loop);
8622
8623 addl(pos, 128);
8624
  // at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
  // the 128B of folded data is in 2 of the zmm registers: zmm0 and zmm4
8627 bind(L_fold_128_B_register);
8628 evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
8629 evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
8630 evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
8631 evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
8632 // save last that has no multiplicand
8633 vextracti64x2(xmm7, xmm4, 3);
8634
8635 evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
8636 evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
8637 // Needed later in reduction loop
8638 movdqu(xmm10, Address(table, 1 * 16));
8639 vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
8640 vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
8641
8642 // Swap 1,0,3,2 - 01 00 11 10
8643 evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
8644 evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
8645 vextracti128(xmm5, xmm8, 1);
8646 evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
8647
8648 // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
8649 // instead of a cmp instruction, we use the negative flag with the jl instruction
8650 addl(len, 128 - 16);
8651 jcc(Assembler::less, L_final_reduction_for_128);
8652
8653 bind(L_16B_reduction_loop);
8654 vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8655 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8656 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8657 movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
8658 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8659 addl(pos, 16);
8660 subl(len, 16);
8661 jcc(Assembler::greaterEqual, L_16B_reduction_loop);
8662
8663 bind(L_final_reduction_for_128);
8664 addl(len, 16);
8665 jcc(Assembler::equal, L_128_done);
8666
8667 bind(L_get_last_two_xmms);
8668 movdqu(xmm2, xmm7);
8669 addl(pos, len);
8670 movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
8671 subl(pos, len);
8672
8673 // get rid of the extra data that was loaded before
8674 // load the shift constant
8675 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
8676 movdqu(xmm0, Address(rax, len));
8677 addl(rax, len);
8678
8679 vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8680 //Change mask to 512
8681 vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
8682 vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
8683
8684 blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
8685 vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8686 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8687 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8688 vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
8689
8690 bind(L_128_done);
8691 // compute crc of a 128-bit value
8692 movdqu(xmm10, Address(table, 3 * 16));
8693 movdqu(xmm0, xmm7);
8694
8695 // 64b fold
8696 vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
8697 vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
8698 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8699
8700 // 32b fold
8701 movdqu(xmm0, xmm7);
8702 vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
8703 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8704 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8705 jmp(L_barrett);
8706
8707 bind(L_less_than_256);
8708 kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
8709
8710 //barrett reduction
8711 bind(L_barrett);
8712 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
8713 movdqu(xmm1, xmm7);
8714 movdqu(xmm2, xmm7);
8715 movdqu(xmm10, Address(table, 4 * 16));
8716
8717 pclmulqdq(xmm7, xmm10, 0x0);
8718 pxor(xmm7, xmm2);
8719 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
8720 movdqu(xmm2, xmm7);
8721 pclmulqdq(xmm7, xmm10, 0x10);
8722 pxor(xmm7, xmm2);
8723 pxor(xmm7, xmm1);
8724 pextrd(crc, xmm7, 2);
8725
8726 bind(L_cleanup);
8727 addptr(rsp, 16 * 2 + 8);
8728 pop(r12);
8729 }
8730
8731 // S. Gueron / Information Processing Letters 112 (2012) 184
8732 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
8733 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
8734 // Output: the 64-bit carry-less product of B * CONST
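// Equivalent scalar computation (a sketch; TABLE stands for the per-chunk
// 256-entry table of 64-bit constants located at StubRoutines::crc32c_table_addr()):
//   uint64_t q1 = TABLE[n][ B        & 0xFF];
//   uint64_t q2 = TABLE[n][(B >>  8) & 0xFF] <<  8;
//   uint64_t q3 = TABLE[n][(B >> 16) & 0xFF] << 16;
//   uint64_t q4 = TABLE[n][(B >> 24) & 0xFF] << 24;
//   result     = q1 ^ q2 ^ q3 ^ q4;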
8735 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
8736 Register tmp1, Register tmp2, Register tmp3) {
8737 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
8738 if (n > 0) {
8739 addq(tmp3, n * 256 * 8);
8740 }
8741 // Q1 = TABLEExt[n][B & 0xFF];
8742 movl(tmp1, in);
8743 andl(tmp1, 0x000000FF);
8744 shll(tmp1, 3);
8745 addq(tmp1, tmp3);
8746 movq(tmp1, Address(tmp1, 0));
8747
8748 // Q2 = TABLEExt[n][B >> 8 & 0xFF];
8749 movl(tmp2, in);
8750 shrl(tmp2, 8);
8751 andl(tmp2, 0x000000FF);
8752 shll(tmp2, 3);
8753 addq(tmp2, tmp3);
8754 movq(tmp2, Address(tmp2, 0));
8755
8756 shlq(tmp2, 8);
8757 xorq(tmp1, tmp2);
8758
8759 // Q3 = TABLEExt[n][B >> 16 & 0xFF];
8760 movl(tmp2, in);
8761 shrl(tmp2, 16);
8762 andl(tmp2, 0x000000FF);
8763 shll(tmp2, 3);
8764 addq(tmp2, tmp3);
8765 movq(tmp2, Address(tmp2, 0));
8766
8767 shlq(tmp2, 16);
8768 xorq(tmp1, tmp2);
8769
8770 // Q4 = TABLEExt[n][B >> 24 & 0xFF];
8771 shrl(in, 24);
8772 andl(in, 0x000000FF);
8773 shll(in, 3);
8774 addq(in, tmp3);
8775 movq(in, Address(in, 0));
8776
8777 shlq(in, 24);
8778 xorq(in, tmp1);
8779 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
8780 }
8781
8782 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
8783 Register in_out,
8784 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
8785 XMMRegister w_xtmp2,
8786 Register tmp1,
8787 Register n_tmp2, Register n_tmp3) {
8788 if (is_pclmulqdq_supported) {
8789 movdl(w_xtmp1, in_out); // modified blindly
8790
8791 movl(tmp1, const_or_pre_comp_const_index);
8792 movdl(w_xtmp2, tmp1);
8793 pclmulqdq(w_xtmp1, w_xtmp2, 0);
8794
8795 movdq(in_out, w_xtmp1);
8796 } else {
8797 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
8798 }
8799 }
8800
8801 // Recombination Alternative 2: No bit-reflections
8802 // T1 = (CRC_A * U1) << 1
8803 // T2 = (CRC_B * U2) << 1
8804 // C1 = T1 >> 32
8805 // C2 = T2 >> 32
8806 // T1 = T1 & 0xFFFFFFFF
8807 // T2 = T2 & 0xFFFFFFFF
8808 // T1 = CRC32(0, T1)
8809 // T2 = CRC32(0, T2)
8810 // C1 = C1 ^ T1
8811 // C2 = C2 ^ T2
8812 // CRC = C1 ^ C2 ^ CRC_C
8813 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
8814 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8815 Register tmp1, Register tmp2,
8816 Register n_tmp3) {
8817 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8818 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8819 shlq(in_out, 1);
8820 movl(tmp1, in_out);
8821 shrq(in_out, 32);
8822 xorl(tmp2, tmp2);
8823 crc32(tmp2, tmp1, 4);
8824 xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
8825 shlq(in1, 1);
8826 movl(tmp1, in1);
8827 shrq(in1, 32);
8828 xorl(tmp2, tmp2);
8829 crc32(tmp2, tmp1, 4);
8830 xorl(in1, tmp2);
8831 xorl(in_out, in1);
8832 xorl(in_out, in2);
8833 }
8834
8835 // Set N to predefined value
// Subtract from the length of the buffer
8837 // execute in a loop:
8838 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
8839 // for i = 1 to N do
8840 // CRC_A = CRC32(CRC_A, A[i])
8841 // CRC_B = CRC32(CRC_B, B[i])
8842 // CRC_C = CRC32(CRC_C, C[i])
8843 // end for
8844 // Recombine
8845 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
8846 Register in_out1, Register in_out2, Register in_out3,
8847 Register tmp1, Register tmp2, Register tmp3,
8848 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8849 Register tmp4, Register tmp5,
8850 Register n_tmp6) {
8851 Label L_processPartitions;
8852 Label L_processPartition;
8853 Label L_exit;
8854
8855 bind(L_processPartitions);
8856 cmpl(in_out1, 3 * size);
8857 jcc(Assembler::less, L_exit);
8858 xorl(tmp1, tmp1);
8859 xorl(tmp2, tmp2);
8860 movq(tmp3, in_out2);
8861 addq(tmp3, size);
8862
8863 bind(L_processPartition);
8864 crc32(in_out3, Address(in_out2, 0), 8);
8865 crc32(tmp1, Address(in_out2, size), 8);
8866 crc32(tmp2, Address(in_out2, size * 2), 8);
8867 addq(in_out2, 8);
8868 cmpq(in_out2, tmp3);
8869 jcc(Assembler::less, L_processPartition);
8870 crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
8871 w_xtmp1, w_xtmp2, w_xtmp3,
8872 tmp4, tmp5,
8873 n_tmp6);
8874 addq(in_out2, 2 * size);
8875 subl(in_out1, 3 * size);
8876 jmp(L_processPartitions);
8877
8878 bind(L_exit);
8879 }
8880
8881 // Algorithm 2: Pipelined usage of the CRC32 instruction.
8882 // Input: A buffer I of L bytes.
8883 // Output: the CRC32C value of the buffer.
8884 // Notations:
8885 // Write L = 24N + r, with N = floor (L/24).
8886 // r = L mod 24 (0 <= r < 24).
// Consider I as the concatenation of A|B|C|R, where A, B and C each consist of
// N quadwords, and R consists of r bytes.
8889 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
// B[j] = I [8N + 8j+7 : 8N + 8j], j= 0, 1, ..., N-1
// C[j] = I [16N + 8j+7 : 16N + 8j], j= 0, 1, ..., N-1
// if r > 0 R[j] = I [24N + j], j= 0, 1, ..., r-1
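// Worked example of the split above: for L = 100, N = floor(100 / 24) = 4 and
// r = 4, so A covers bytes 0..31, B bytes 32..63, C bytes 64..95 and R the
// final bytes 96..99; A, B and C are CRC-ed in parallel with the crc32
// instruction and then recombined (crc32c_rec_alt2).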
8893 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
8894 Register tmp1, Register tmp2, Register tmp3,
8895 Register tmp4, Register tmp5, Register tmp6,
8896 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8897 bool is_pclmulqdq_supported) {
8898 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
8899 Label L_wordByWord;
8900 Label L_byteByByteProlog;
8901 Label L_byteByByte;
8902 Label L_exit;
8903
8904 if (is_pclmulqdq_supported ) {
8905 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::crc32c_table_addr();
8906 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 1);
8907
8908 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 2);
8909 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 3);
8910
8911 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 4);
8912 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 5);
8913 assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
8914 } else {
8915 const_or_pre_comp_const_index[0] = 1;
8916 const_or_pre_comp_const_index[1] = 0;
8917
8918 const_or_pre_comp_const_index[2] = 3;
8919 const_or_pre_comp_const_index[3] = 2;
8920
8921 const_or_pre_comp_const_index[4] = 5;
8922 const_or_pre_comp_const_index[5] = 4;
8923 }
8924 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8925 in2, in1, in_out,
8926 tmp1, tmp2, tmp3,
8927 w_xtmp1, w_xtmp2, w_xtmp3,
8928 tmp4, tmp5,
8929 tmp6);
8930 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8931 in2, in1, in_out,
8932 tmp1, tmp2, tmp3,
8933 w_xtmp1, w_xtmp2, w_xtmp3,
8934 tmp4, tmp5,
8935 tmp6);
8936 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8937 in2, in1, in_out,
8938 tmp1, tmp2, tmp3,
8939 w_xtmp1, w_xtmp2, w_xtmp3,
8940 tmp4, tmp5,
8941 tmp6);
8942 movl(tmp1, in2);
8943 andl(tmp1, 0x00000007);
8944 negl(tmp1);
8945 addl(tmp1, in2);
8946 addq(tmp1, in1);
8947
8948 cmpq(in1, tmp1);
8949 jccb(Assembler::greaterEqual, L_byteByByteProlog);
8950 align(16);
8951 BIND(L_wordByWord);
8952 crc32(in_out, Address(in1, 0), 8);
8953 addq(in1, 8);
8954 cmpq(in1, tmp1);
8955 jcc(Assembler::less, L_wordByWord);
8956
8957 BIND(L_byteByByteProlog);
8958 andl(in2, 0x00000007);
8959 movl(tmp2, 1);
8960
8961 cmpl(tmp2, in2);
8962 jccb(Assembler::greater, L_exit);
8963 BIND(L_byteByByte);
8964 crc32(in_out, Address(in1, 0), 1);
8965 incq(in1);
8966 incl(tmp2);
8967 cmpl(tmp2, in2);
8968 jcc(Assembler::lessEqual, L_byteByByte);
8969
8970 BIND(L_exit);
8971 }
8972 #undef BIND
8973 #undef BLOCK_COMMENT
8974
8975 // Compress char[] array to byte[].
8976 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
8977 // Return the array length if every element in array can be encoded,
8978 // otherwise, the index of first non-latin1 (> 0xff) character.
8979 // @IntrinsicCandidate
8980 // public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8981 // for (int i = 0; i < len; i++) {
8982 // char c = src[srcOff];
8983 // if (c > 0xff) {
8984 // return i; // return index of non-latin1 char
8985 // }
8986 // dst[dstOff] = (byte)c;
8987 // srcOff++;
8988 // dstOff++;
8989 // }
8990 // return len;
8991 // }
8992 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
8993 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8994 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8995 Register tmp5, Register result, KRegister mask1, KRegister mask2) {
8996 Label copy_chars_loop, done, reset_sp, copy_tail;
8997
8998 // rsi: src
8999 // rdi: dst
9000 // rdx: len
9001 // rcx: tmp5
9002 // rax: result
9003
9004 // rsi holds start addr of source char[] to be compressed
9005 // rdi holds start addr of destination byte[]
9006 // rdx holds length
9007
9008 assert(len != result, "");
9009
9010 // save length for return
9011 movl(result, len);
9012
9013 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
9014 VM_Version::supports_avx512vlbw() &&
9015 VM_Version::supports_bmi2()) {
9016
9017 Label copy_32_loop, copy_loop_tail, below_threshold, reset_for_copy_tail;
9018
9019 // alignment
9020 Label post_alignment;
9021
    // if the length of the string is less than 32, handle it the old-fashioned way
9023 testl(len, -32);
9024 jcc(Assembler::zero, below_threshold);
9025
9026 // First check whether a character is compressible ( <= 0xFF).
9027 // Create mask to test for Unicode chars inside zmm vector
9028 movl(tmp5, 0x00FF);
9029 evpbroadcastw(tmp2Reg, tmp5, Assembler::AVX_512bit);
9030
9031 testl(len, -64);
9032 jccb(Assembler::zero, post_alignment);
9033
9034 movl(tmp5, dst);
9035 andl(tmp5, (32 - 1));
9036 negl(tmp5);
9037 andl(tmp5, (32 - 1));
9038
9039 // bail out when there is nothing to be done
9040 testl(tmp5, 0xFFFFFFFF);
9041 jccb(Assembler::zero, post_alignment);
9042
9043 // ~(~0 << len), where len is the # of remaining elements to process
9044 movl(len, 0xFFFFFFFF);
9045 shlxl(len, len, tmp5);
9046 notl(len);
9047 kmovdl(mask2, len);
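    // e.g. tmp5 == 5 above gives len == ~(0xFFFFFFFF << 5) == 0x1F, so mask2
    // selects only the low 5 word lanes for the masked load/compare/store below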
9048 movl(len, result);
9049
9050 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
9051 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
9052 ktestd(mask1, mask2);
9053 jcc(Assembler::carryClear, copy_tail);
9054
9055 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
9056
9057 addptr(src, tmp5);
9058 addptr(src, tmp5);
9059 addptr(dst, tmp5);
9060 subl(len, tmp5);
9061
9062 bind(post_alignment);
9063 // end of alignment
9064
9065 movl(tmp5, len);
9066 andl(tmp5, (32 - 1)); // tail count (in chars)
9067 andl(len, ~(32 - 1)); // vector count (in chars)
9068 jccb(Assembler::zero, copy_loop_tail);
9069
9070 lea(src, Address(src, len, Address::times_2));
9071 lea(dst, Address(dst, len, Address::times_1));
9072 negptr(len);
9073
9074 bind(copy_32_loop);
9075 evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
9076 evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9077 kortestdl(mask1, mask1);
9078 jccb(Assembler::carryClear, reset_for_copy_tail);
9079
    // All elements in the currently processed chunk are valid candidates for
    // compression. Write the truncated byte elements to memory.
9082 evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
9083 addptr(len, 32);
9084 jccb(Assembler::notZero, copy_32_loop);
9085
9086 bind(copy_loop_tail);
9087 // bail out when there is nothing to be done
9088 testl(tmp5, 0xFFFFFFFF);
9089 jcc(Assembler::zero, done);
9090
9091 movl(len, tmp5);
9092
9093 // ~(~0 << len), where len is the # of remaining elements to process
9094 movl(tmp5, 0xFFFFFFFF);
9095 shlxl(tmp5, tmp5, len);
9096 notl(tmp5);
9097
9098 kmovdl(mask2, tmp5);
9099
9100 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
9101 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
9102 ktestd(mask1, mask2);
9103 jcc(Assembler::carryClear, copy_tail);
9104
9105 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
9106 jmp(done);
9107
9108 bind(reset_for_copy_tail);
9109 lea(src, Address(src, tmp5, Address::times_2));
9110 lea(dst, Address(dst, tmp5, Address::times_1));
9111 subptr(len, tmp5);
9112 jmp(copy_chars_loop);
9113
9114 bind(below_threshold);
9115 }
9116
9117 if (UseSSE42Intrinsics) {
9118 Label copy_32_loop, copy_16, copy_tail_sse, reset_for_copy_tail;
9119
9120 // vectored compression
9121 testl(len, 0xfffffff8);
9122 jcc(Assembler::zero, copy_tail);
9123
9124 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
9125 movdl(tmp1Reg, tmp5);
9126 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
9127
9128 andl(len, 0xfffffff0);
9129 jccb(Assembler::zero, copy_16);
9130
9131 // compress 16 chars per iter
9132 pxor(tmp4Reg, tmp4Reg);
9133
9134 lea(src, Address(src, len, Address::times_2));
9135 lea(dst, Address(dst, len, Address::times_1));
9136 negptr(len);
9137
9138 bind(copy_32_loop);
9139 movdqu(tmp2Reg, Address(src, len, Address::times_2)); // load 1st 8 characters
9140 por(tmp4Reg, tmp2Reg);
9141 movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
9142 por(tmp4Reg, tmp3Reg);
9143 ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector
9144 jccb(Assembler::notZero, reset_for_copy_tail);
    packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte
9146 movdqu(Address(dst, len, Address::times_1), tmp2Reg);
9147 addptr(len, 16);
9148 jccb(Assembler::notZero, copy_32_loop);
9149
9150 // compress next vector of 8 chars (if any)
9151 bind(copy_16);
9152 // len = 0
9153 testl(result, 0x00000008); // check if there's a block of 8 chars to compress
9154 jccb(Assembler::zero, copy_tail_sse);
9155
9156 pxor(tmp3Reg, tmp3Reg);
9157
9158 movdqu(tmp2Reg, Address(src, 0));
9159 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
9160 jccb(Assembler::notZero, reset_for_copy_tail);
9161 packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte
9162 movq(Address(dst, 0), tmp2Reg);
9163 addptr(src, 16);
9164 addptr(dst, 8);
9165 jmpb(copy_tail_sse);
9166
9167 bind(reset_for_copy_tail);
9168 movl(tmp5, result);
9169 andl(tmp5, 0x0000000f);
9170 lea(src, Address(src, tmp5, Address::times_2));
9171 lea(dst, Address(dst, tmp5, Address::times_1));
9172 subptr(len, tmp5);
9173 jmpb(copy_chars_loop);
9174
9175 bind(copy_tail_sse);
9176 movl(len, result);
9177 andl(len, 0x00000007); // tail count (in chars)
9178 }
9179 // compress 1 char per iter
9180 bind(copy_tail);
9181 testl(len, len);
9182 jccb(Assembler::zero, done);
9183 lea(src, Address(src, len, Address::times_2));
9184 lea(dst, Address(dst, len, Address::times_1));
9185 negptr(len);
9186
9187 bind(copy_chars_loop);
9188 load_unsigned_short(tmp5, Address(src, len, Address::times_2));
9189 testl(tmp5, 0xff00); // check if Unicode char
9190 jccb(Assembler::notZero, reset_sp);
  movb(Address(dst, len, Address::times_1), tmp5); // LATIN1 char; compress to 1 byte
9192 increment(len);
9193 jccb(Assembler::notZero, copy_chars_loop);
9194
9195 // add len then return (len will be zero if compress succeeded, otherwise negative)
9196 bind(reset_sp);
9197 addl(result, len);
9198
9199 bind(done);
9200 }
9201
9202 // Inflate byte[] array to char[].
9203 // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
9204 // @IntrinsicCandidate
9205 // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
9206 // for (int i = 0; i < len; i++) {
9207 // dst[dstOff++] = (char)(src[srcOff++] & 0xff);
9208 // }
9209 // }
9210 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
9211 XMMRegister tmp1, Register tmp2, KRegister mask) {
9212 Label copy_chars_loop, done, below_threshold, avx3_threshold;
9213 // rsi: src
9214 // rdi: dst
9215 // rdx: len
9216 // rcx: tmp2
9217
9218 // rsi holds start addr of source byte[] to be inflated
9219 // rdi holds start addr of destination char[]
9220 // rdx holds length
9221 assert_different_registers(src, dst, len, tmp2);
9222 movl(tmp2, len);
9223 if ((UseAVX > 2) && // AVX512
9224 VM_Version::supports_avx512vlbw() &&
9225 VM_Version::supports_bmi2()) {
9226
9227 Label copy_32_loop, copy_tail;
9228 Register tmp3_aliased = len;
9229
    // if the length of the string is less than 16, handle it in an old-fashioned way
9231 testl(len, -16);
9232 jcc(Assembler::zero, below_threshold);
9233
9234 testl(len, -1 * AVX3Threshold);
9235 jcc(Assembler::zero, avx3_threshold);
9236
9237 // In order to use only one arithmetic operation for the main loop we use
9238 // this pre-calculation
9239 andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
9240 andl(len, -32); // vector count
9241 jccb(Assembler::zero, copy_tail);
9242
9243 lea(src, Address(src, len, Address::times_1));
9244 lea(dst, Address(dst, len, Address::times_2));
9245 negptr(len);
9246
9247
9248 // inflate 32 chars per iter
9249 bind(copy_32_loop);
9250 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
9251 evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
9252 addptr(len, 32);
9253 jcc(Assembler::notZero, copy_32_loop);
9254
9255 bind(copy_tail);
9256 // bail out when there is nothing to be done
9257 testl(tmp2, -1); // we don't destroy the contents of tmp2 here
9258 jcc(Assembler::zero, done);
9259
9260 // ~(~0 << length), where length is the # of remaining elements to process
9261 movl(tmp3_aliased, -1);
9262 shlxl(tmp3_aliased, tmp3_aliased, tmp2);
9263 notl(tmp3_aliased);
9264 kmovdl(mask, tmp3_aliased);
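    // e.g. tmp2 == 7 gives tmp3_aliased == ~(-1 << 7) == 0x7F, so 'mask' selects
    // only the low 7 byte->word lanes for the masked zero-extending load/store below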
9265 evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
9266 evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
9267
9268 jmp(done);
9269 bind(avx3_threshold);
9270 }
9271 if (UseSSE42Intrinsics) {
9272 Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
9273
9274 if (UseAVX > 1) {
9275 andl(tmp2, (16 - 1));
9276 andl(len, -16);
9277 jccb(Assembler::zero, copy_new_tail);
9278 } else {
9279 andl(tmp2, 0x00000007); // tail count (in chars)
9280 andl(len, 0xfffffff8); // vector count (in chars)
9281 jccb(Assembler::zero, copy_tail);
9282 }
9283
9284 // vectored inflation
9285 lea(src, Address(src, len, Address::times_1));
9286 lea(dst, Address(dst, len, Address::times_2));
9287 negptr(len);
9288
9289 if (UseAVX > 1) {
9290 bind(copy_16_loop);
9291 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
9292 vmovdqu(Address(dst, len, Address::times_2), tmp1);
9293 addptr(len, 16);
9294 jcc(Assembler::notZero, copy_16_loop);
9295
9296 bind(below_threshold);
9297 bind(copy_new_tail);
9298 movl(len, tmp2);
9299 andl(tmp2, 0x00000007);
9300 andl(len, 0xFFFFFFF8);
9301 jccb(Assembler::zero, copy_tail);
9302
9303 pmovzxbw(tmp1, Address(src, 0));
9304 movdqu(Address(dst, 0), tmp1);
9305 addptr(src, 8);
9306 addptr(dst, 2 * 8);
9307
9308 jmp(copy_tail, true);
9309 }
9310
9311 // inflate 8 chars per iter
9312 bind(copy_8_loop);
9313 pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
9314 movdqu(Address(dst, len, Address::times_2), tmp1);
9315 addptr(len, 8);
9316 jcc(Assembler::notZero, copy_8_loop);
9317
9318 bind(copy_tail);
9319 movl(len, tmp2);
9320
9321 cmpl(len, 4);
9322 jccb(Assembler::less, copy_bytes);
9323
9324 movdl(tmp1, Address(src, 0)); // load 4 byte chars
9325 pmovzxbw(tmp1, tmp1);
9326 movq(Address(dst, 0), tmp1);
9327 subptr(len, 4);
9328 addptr(src, 4);
9329 addptr(dst, 8);
9330
9331 bind(copy_bytes);
9332 } else {
9333 bind(below_threshold);
9334 }
9335
9336 testl(len, len);
9337 jccb(Assembler::zero, done);
9338 lea(src, Address(src, len, Address::times_1));
9339 lea(dst, Address(dst, len, Address::times_2));
9340 negptr(len);
9341
9342 // inflate 1 char per iter
9343 bind(copy_chars_loop);
9344 load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char
9345 movw(Address(dst, len, Address::times_2), tmp2); // inflate byte char to word
9346 increment(len);
9347 jcc(Assembler::notZero, copy_chars_loop);
9348
9349 bind(done);
9350 }
9351
9352 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
9353 switch(type) {
9354 case T_BYTE:
9355 case T_BOOLEAN:
9356 evmovdqub(dst, kmask, src, merge, vector_len);
9357 break;
9358 case T_CHAR:
9359 case T_SHORT:
9360 evmovdquw(dst, kmask, src, merge, vector_len);
9361 break;
9362 case T_INT:
9363 case T_FLOAT:
9364 evmovdqul(dst, kmask, src, merge, vector_len);
9365 break;
9366 case T_LONG:
9367 case T_DOUBLE:
9368 evmovdquq(dst, kmask, src, merge, vector_len);
9369 break;
9370 default:
9371 fatal("Unexpected type argument %s", type2name(type));
9372 break;
9373 }
9374 }
9375
9376
9377 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
9378 switch(type) {
9379 case T_BYTE:
9380 case T_BOOLEAN:
9381 evmovdqub(dst, kmask, src, merge, vector_len);
9382 break;
9383 case T_CHAR:
9384 case T_SHORT:
9385 evmovdquw(dst, kmask, src, merge, vector_len);
9386 break;
9387 case T_INT:
9388 case T_FLOAT:
9389 evmovdqul(dst, kmask, src, merge, vector_len);
9390 break;
9391 case T_LONG:
9392 case T_DOUBLE:
9393 evmovdquq(dst, kmask, src, merge, vector_len);
9394 break;
9395 default:
9396 fatal("Unexpected type argument %s", type2name(type));
9397 break;
9398 }
9399 }
9400
9401 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
9402 switch(type) {
9403 case T_BYTE:
9404 case T_BOOLEAN:
9405 evmovdqub(dst, kmask, src, merge, vector_len);
9406 break;
9407 case T_CHAR:
9408 case T_SHORT:
9409 evmovdquw(dst, kmask, src, merge, vector_len);
9410 break;
9411 case T_INT:
9412 case T_FLOAT:
9413 evmovdqul(dst, kmask, src, merge, vector_len);
9414 break;
9415 case T_LONG:
9416 case T_DOUBLE:
9417 evmovdquq(dst, kmask, src, merge, vector_len);
9418 break;
9419 default:
9420 fatal("Unexpected type argument %s", type2name(type));
9421 break;
9422 }
9423 }
9424
9425 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
9426 switch(masklen) {
9427 case 2:
9428 knotbl(dst, src);
9429 movl(rtmp, 3);
9430 kmovbl(ktmp, rtmp);
9431 kandbl(dst, ktmp, dst);
9432 break;
9433 case 4:
9434 knotbl(dst, src);
9435 movl(rtmp, 15);
9436 kmovbl(ktmp, rtmp);
9437 kandbl(dst, ktmp, dst);
9438 break;
9439 case 8:
9440 knotbl(dst, src);
9441 break;
9442 case 16:
9443 knotwl(dst, src);
9444 break;
9445 case 32:
9446 knotdl(dst, src);
9447 break;
9448 case 64:
9449 knotql(dst, src);
9450 break;
9451 default:
9452 fatal("Unexpected vector length %d", masklen);
9453 break;
9454 }
9455 }
9456
9457 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9458 switch(type) {
9459 case T_BOOLEAN:
9460 case T_BYTE:
9461 kandbl(dst, src1, src2);
9462 break;
9463 case T_CHAR:
9464 case T_SHORT:
9465 kandwl(dst, src1, src2);
9466 break;
9467 case T_INT:
9468 case T_FLOAT:
9469 kanddl(dst, src1, src2);
9470 break;
9471 case T_LONG:
9472 case T_DOUBLE:
9473 kandql(dst, src1, src2);
9474 break;
9475 default:
9476 fatal("Unexpected type argument %s", type2name(type));
9477 break;
9478 }
9479 }
9480
9481 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9482 switch(type) {
9483 case T_BOOLEAN:
9484 case T_BYTE:
9485 korbl(dst, src1, src2);
9486 break;
9487 case T_CHAR:
9488 case T_SHORT:
9489 korwl(dst, src1, src2);
9490 break;
9491 case T_INT:
9492 case T_FLOAT:
9493 kordl(dst, src1, src2);
9494 break;
9495 case T_LONG:
9496 case T_DOUBLE:
9497 korql(dst, src1, src2);
9498 break;
9499 default:
9500 fatal("Unexpected type argument %s", type2name(type));
9501 break;
9502 }
9503 }
9504
9505 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9506 switch(type) {
9507 case T_BOOLEAN:
9508 case T_BYTE:
9509 kxorbl(dst, src1, src2);
9510 break;
9511 case T_CHAR:
9512 case T_SHORT:
9513 kxorwl(dst, src1, src2);
9514 break;
9515 case T_INT:
9516 case T_FLOAT:
9517 kxordl(dst, src1, src2);
9518 break;
9519 case T_LONG:
9520 case T_DOUBLE:
9521 kxorql(dst, src1, src2);
9522 break;
9523 default:
9524 fatal("Unexpected type argument %s", type2name(type));
9525 break;
9526 }
9527 }
9528
9529 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9530 switch(type) {
9531 case T_BOOLEAN:
9532 case T_BYTE:
9533 evpermb(dst, mask, nds, src, merge, vector_len); break;
9534 case T_CHAR:
9535 case T_SHORT:
9536 evpermw(dst, mask, nds, src, merge, vector_len); break;
9537 case T_INT:
9538 case T_FLOAT:
9539 evpermd(dst, mask, nds, src, merge, vector_len); break;
9540 case T_LONG:
9541 case T_DOUBLE:
9542 evpermq(dst, mask, nds, src, merge, vector_len); break;
9543 default:
9544 fatal("Unexpected type argument %s", type2name(type)); break;
9545 }
9546 }
9547
9548 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9549 switch(type) {
9550 case T_BOOLEAN:
9551 case T_BYTE:
9552 evpermb(dst, mask, nds, src, merge, vector_len); break;
9553 case T_CHAR:
9554 case T_SHORT:
9555 evpermw(dst, mask, nds, src, merge, vector_len); break;
9556 case T_INT:
9557 case T_FLOAT:
9558 evpermd(dst, mask, nds, src, merge, vector_len); break;
9559 case T_LONG:
9560 case T_DOUBLE:
9561 evpermq(dst, mask, nds, src, merge, vector_len); break;
9562 default:
9563 fatal("Unexpected type argument %s", type2name(type)); break;
9564 }
9565 }
9566
9567 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9568 switch(type) {
9569 case T_BYTE:
9570 evpminub(dst, mask, nds, src, merge, vector_len); break;
9571 case T_SHORT:
9572 evpminuw(dst, mask, nds, src, merge, vector_len); break;
9573 case T_INT:
9574 evpminud(dst, mask, nds, src, merge, vector_len); break;
9575 case T_LONG:
9576 evpminuq(dst, mask, nds, src, merge, vector_len); break;
9577 default:
9578 fatal("Unexpected type argument %s", type2name(type)); break;
9579 }
9580 }
9581
9582 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9583 switch(type) {
9584 case T_BYTE:
9585 evpmaxub(dst, mask, nds, src, merge, vector_len); break;
9586 case T_SHORT:
9587 evpmaxuw(dst, mask, nds, src, merge, vector_len); break;
9588 case T_INT:
9589 evpmaxud(dst, mask, nds, src, merge, vector_len); break;
9590 case T_LONG:
9591 evpmaxuq(dst, mask, nds, src, merge, vector_len); break;
9592 default:
9593 fatal("Unexpected type argument %s", type2name(type)); break;
9594 }
9595 }
9596
9597 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9598 switch(type) {
9599 case T_BYTE:
9600 evpminub(dst, mask, nds, src, merge, vector_len); break;
9601 case T_SHORT:
9602 evpminuw(dst, mask, nds, src, merge, vector_len); break;
9603 case T_INT:
9604 evpminud(dst, mask, nds, src, merge, vector_len); break;
9605 case T_LONG:
9606 evpminuq(dst, mask, nds, src, merge, vector_len); break;
9607 default:
9608 fatal("Unexpected type argument %s", type2name(type)); break;
9609 }
9610 }
9611
9612 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9613 switch(type) {
9614 case T_BYTE:
9615 evpmaxub(dst, mask, nds, src, merge, vector_len); break;
9616 case T_SHORT:
9617 evpmaxuw(dst, mask, nds, src, merge, vector_len); break;
9618 case T_INT:
9619 evpmaxud(dst, mask, nds, src, merge, vector_len); break;
9620 case T_LONG:
9621 evpmaxuq(dst, mask, nds, src, merge, vector_len); break;
9622 default:
9623 fatal("Unexpected type argument %s", type2name(type)); break;
9624 }
9625 }
9626
9627 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9628 switch(type) {
9629 case T_BYTE:
9630 evpminsb(dst, mask, nds, src, merge, vector_len); break;
9631 case T_SHORT:
9632 evpminsw(dst, mask, nds, src, merge, vector_len); break;
9633 case T_INT:
9634 evpminsd(dst, mask, nds, src, merge, vector_len); break;
9635 case T_LONG:
9636 evpminsq(dst, mask, nds, src, merge, vector_len); break;
9637 case T_FLOAT:
9638 evminmaxps(dst, mask, nds, src, merge, AVX10_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9639 case T_DOUBLE:
9640 evminmaxpd(dst, mask, nds, src, merge, AVX10_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9641 default:
9642 fatal("Unexpected type argument %s", type2name(type)); break;
9643 }
9644 }
9645
9646 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9647 switch(type) {
9648 case T_BYTE:
9649 evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9650 case T_SHORT:
9651 evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9652 case T_INT:
9653 evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9654 case T_LONG:
9655 evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9656 case T_FLOAT:
9657 evminmaxps(dst, mask, nds, src, merge, AVX10_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9658 case T_DOUBLE:
9659 evminmaxpd(dst, mask, nds, src, merge, AVX10_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9660 default:
9661 fatal("Unexpected type argument %s", type2name(type)); break;
9662 }
9663 }
9664
9665 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9666 switch(type) {
9667 case T_BYTE:
9668 evpminsb(dst, mask, nds, src, merge, vector_len); break;
9669 case T_SHORT:
9670 evpminsw(dst, mask, nds, src, merge, vector_len); break;
9671 case T_INT:
9672 evpminsd(dst, mask, nds, src, merge, vector_len); break;
9673 case T_LONG:
9674 evpminsq(dst, mask, nds, src, merge, vector_len); break;
9675 case T_FLOAT:
9676 evminmaxps(dst, mask, nds, src, merge, AVX10_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9677 case T_DOUBLE:
9678 evminmaxpd(dst, mask, nds, src, merge, AVX10_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9679 default:
9680 fatal("Unexpected type argument %s", type2name(type)); break;
9681 }
9682 }
9683
9684 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9685 switch(type) {
9686 case T_BYTE:
9687 evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9688 case T_SHORT:
9689 evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9690 case T_INT:
9691 evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9692 case T_LONG:
9693 evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9694 case T_FLOAT:
9695 evminmaxps(dst, mask, nds, src, merge, AVX10_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9696 case T_DOUBLE:
      evminmaxpd(dst, mask, nds, src, merge, AVX10_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9698 default:
9699 fatal("Unexpected type argument %s", type2name(type)); break;
9700 }
9701 }
9702
9703 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9704 switch(type) {
9705 case T_INT:
9706 evpxord(dst, mask, nds, src, merge, vector_len); break;
9707 case T_LONG:
9708 evpxorq(dst, mask, nds, src, merge, vector_len); break;
9709 default:
9710 fatal("Unexpected type argument %s", type2name(type)); break;
9711 }
9712 }
9713
9714 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9715 switch(type) {
9716 case T_INT:
9717 evpxord(dst, mask, nds, src, merge, vector_len); break;
9718 case T_LONG:
9719 evpxorq(dst, mask, nds, src, merge, vector_len); break;
9720 default:
9721 fatal("Unexpected type argument %s", type2name(type)); break;
9722 }
9723 }
9724
9725 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9726 switch(type) {
9727 case T_INT:
9728 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9729 case T_LONG:
9730 evporq(dst, mask, nds, src, merge, vector_len); break;
9731 default:
9732 fatal("Unexpected type argument %s", type2name(type)); break;
9733 }
9734 }
9735
9736 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9737 switch(type) {
9738 case T_INT:
9739 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9740 case T_LONG:
9741 evporq(dst, mask, nds, src, merge, vector_len); break;
9742 default:
9743 fatal("Unexpected type argument %s", type2name(type)); break;
9744 }
9745 }
9746
9747 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9748 switch(type) {
9749 case T_INT:
9750 evpandd(dst, mask, nds, src, merge, vector_len); break;
9751 case T_LONG:
9752 evpandq(dst, mask, nds, src, merge, vector_len); break;
9753 default:
9754 fatal("Unexpected type argument %s", type2name(type)); break;
9755 }
9756 }
9757
9758 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9759 switch(type) {
9760 case T_INT:
9761 evpandd(dst, mask, nds, src, merge, vector_len); break;
9762 case T_LONG:
9763 evpandq(dst, mask, nds, src, merge, vector_len); break;
9764 default:
9765 fatal("Unexpected type argument %s", type2name(type)); break;
9766 }
9767 }
9768
9769 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
9770 switch(masklen) {
9771 case 8:
9772 kortestbl(src1, src2);
9773 break;
9774 case 16:
9775 kortestwl(src1, src2);
9776 break;
9777 case 32:
9778 kortestdl(src1, src2);
9779 break;
9780 case 64:
9781 kortestql(src1, src2);
9782 break;
9783 default:
9784 fatal("Unexpected mask length %d", masklen);
9785 break;
9786 }
9787 }
9788
9789
9790 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
9791 switch(masklen) {
9792 case 8:
9793 ktestbl(src1, src2);
9794 break;
9795 case 16:
9796 ktestwl(src1, src2);
9797 break;
9798 case 32:
9799 ktestdl(src1, src2);
9800 break;
9801 case 64:
9802 ktestql(src1, src2);
9803 break;
9804 default:
9805 fatal("Unexpected mask length %d", masklen);
9806 break;
9807 }
9808 }
9809
9810 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9811 switch(type) {
9812 case T_INT:
9813 evprold(dst, mask, src, shift, merge, vlen_enc); break;
9814 case T_LONG:
9815 evprolq(dst, mask, src, shift, merge, vlen_enc); break;
9816 default:
      fatal("Unexpected type argument %s", type2name(type)); break;
9819 }
9820 }
9821
9822 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9823 switch(type) {
9824 case T_INT:
9825 evprord(dst, mask, src, shift, merge, vlen_enc); break;
9826 case T_LONG:
9827 evprorq(dst, mask, src, shift, merge, vlen_enc); break;
9828 default:
9829 fatal("Unexpected type argument %s", type2name(type)); break;
9830 }
9831 }
9832
9833 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9834 switch(type) {
9835 case T_INT:
9836 evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
9837 case T_LONG:
9838 evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
9839 default:
9840 fatal("Unexpected type argument %s", type2name(type)); break;
9841 }
9842 }
9843
9844 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9845 switch(type) {
9846 case T_INT:
9847 evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
9848 case T_LONG:
9849 evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
9850 default:
9851 fatal("Unexpected type argument %s", type2name(type)); break;
9852 }
9853 }
9854
9855 void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9856 assert(rscratch != noreg || always_reachable(src), "missing");
9857
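  // If the literal is RIP-reachable, address it directly; otherwise materialize its
  // address into the scratch register first (hence the assert above).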
9858 if (reachable(src)) {
9859 evpandq(dst, nds, as_Address(src), vector_len);
9860 } else {
9861 lea(rscratch, src);
9862 evpandq(dst, nds, Address(rscratch, 0), vector_len);
9863 }
9864 }
9865
9866 void MacroAssembler::evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
9867 assert(rscratch != noreg || always_reachable(src), "missing");
9868
9869 if (reachable(src)) {
9870 Assembler::evpaddq(dst, mask, nds, as_Address(src), merge, vector_len);
9871 } else {
9872 lea(rscratch, src);
9873 Assembler::evpaddq(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
9874 }
9875 }
9876
9877 void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9878 assert(rscratch != noreg || always_reachable(src), "missing");
9879
9880 if (reachable(src)) {
9881 evporq(dst, nds, as_Address(src), vector_len);
9882 } else {
9883 lea(rscratch, src);
9884 evporq(dst, nds, Address(rscratch, 0), vector_len);
9885 }
9886 }
9887
9888 void MacroAssembler::vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9889 assert(rscratch != noreg || always_reachable(src), "missing");
9890
9891 if (reachable(src)) {
9892 vpshufb(dst, nds, as_Address(src), vector_len);
9893 } else {
9894 lea(rscratch, src);
9895 vpshufb(dst, nds, Address(rscratch, 0), vector_len);
9896 }
9897 }
9898
9899 void MacroAssembler::vpor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9900 assert(rscratch != noreg || always_reachable(src), "missing");
9901
9902 if (reachable(src)) {
9903 Assembler::vpor(dst, nds, as_Address(src), vector_len);
9904 } else {
9905 lea(rscratch, src);
9906 Assembler::vpor(dst, nds, Address(rscratch, 0), vector_len);
9907 }
9908 }
9909
9910 void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) {
9911 assert(rscratch != noreg || always_reachable(src3), "missing");
9912
9913 if (reachable(src3)) {
9914 vpternlogq(dst, imm8, src2, as_Address(src3), vector_len);
9915 } else {
9916 lea(rscratch, src3);
9917 vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len);
9918 }
9919 }
9920
9921 #if COMPILER2_OR_JVMCI
9922
9923 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
9924 Register length, Register temp, int vec_enc) {
9925 // Computing mask for predicated vector store.
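  // bzhi clears the bits of temp at and above index `length`, leaving temp = (1 << length) - 1,
  // so the resulting k-mask enables exactly `length` lanes for the masked store below.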
9926 movptr(temp, -1);
9927 bzhiq(temp, temp, length);
9928 kmov(mask, temp);
9929 evmovdqu(bt, mask, dst, xmm, true, vec_enc);
9930 }
9931
// Fill memory for a length of at most 64 bytes, using a masked store for the tail.
9933 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
9934 XMMRegister xmm, KRegister mask, Register length,
9935 Register temp, bool use64byteVector) {
9936 assert(MaxVectorSize >= 32, "vector length should be >= 32");
9937 const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
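  // `shift` is log2 of the element size; type[shift] recovers the element type so that the
  // mask is interpreted per element rather than per byte.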
9938 if (!use64byteVector) {
9939 fill32(dst, disp, xmm);
9940 subptr(length, 32 >> shift);
9941 fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
9942 } else {
9943 assert(MaxVectorSize == 64, "vector length != 64");
9944 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
9945 }
9946 }
9947
9948
9949 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
9950 XMMRegister xmm, KRegister mask, Register length,
9951 Register temp) {
9952 assert(MaxVectorSize >= 32, "vector length should be >= 32");
9953 const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9954 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
9955 }
9956
9957
9958 void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
9959 assert(MaxVectorSize >= 32, "vector length should be >= 32");
9960 vmovdqu(dst, xmm);
9961 }
9962
9963 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
9964 fill32(Address(dst, disp), xmm);
9965 }
9966
9967 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
9968 assert(MaxVectorSize >= 32, "vector length should be >= 32");
9969 if (!use64byteVector) {
9970 fill32(dst, xmm);
9971 fill32(dst.plus_disp(32), xmm);
9972 } else {
9973 evmovdquq(dst, xmm, Assembler::AVX_512bit);
9974 }
9975 }
9976
9977 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
9978 fill64(Address(dst, disp), xmm, use64byteVector);
9979 }
9980
9981 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
9982 Register count, Register rtmp, XMMRegister xtmp) {
9983 Label L_exit;
9984 Label L_fill_start;
9985 Label L_fill_64_bytes;
9986 Label L_fill_96_bytes;
9987 Label L_fill_128_bytes;
9988 Label L_fill_128_bytes_loop;
9989 Label L_fill_128_loop_header;
9990 Label L_fill_128_bytes_loop_header;
9991 Label L_fill_128_bytes_loop_pre_header;
9992 Label L_fill_zmm_sequence;
9993
9994 int shift = -1;
9995 int avx3threshold = VM_Version::avx3_threshold();
9996 switch(type) {
9997 case T_BYTE: shift = 0;
9998 break;
9999 case T_SHORT: shift = 1;
10000 break;
10001 case T_INT: shift = 2;
10002 break;
10003 /* Uncomment when LONG fill stubs are supported.
10004 case T_LONG: shift = 3;
10005 break;
10006 */
10007 default:
10008 fatal("Unhandled type: %s\n", type2name(type));
10009 }
10010
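  // Use the 32-byte (YMM) sequence below when a non-zero AVX3 threshold is configured or
  // when only 256-bit vectors are available; with MaxVectorSize == 64, counts above the
  // threshold are handled by the 64-byte (ZMM) sequence further down.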
10011 if ((avx3threshold != 0) || (MaxVectorSize == 32)) {
10012
10013 if (MaxVectorSize == 64) {
10014 cmpq(count, avx3threshold >> shift);
10015 jcc(Assembler::greater, L_fill_zmm_sequence);
10016 }
10017
10018 evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
10019
10020 bind(L_fill_start);
10021
10022 cmpq(count, 32 >> shift);
10023 jccb(Assembler::greater, L_fill_64_bytes);
10024 fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
10025 jmp(L_exit);
10026
10027 bind(L_fill_64_bytes);
10028 cmpq(count, 64 >> shift);
10029 jccb(Assembler::greater, L_fill_96_bytes);
10030 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
10031 jmp(L_exit);
10032
10033 bind(L_fill_96_bytes);
10034 cmpq(count, 96 >> shift);
10035 jccb(Assembler::greater, L_fill_128_bytes);
10036 fill64(to, 0, xtmp);
10037 subq(count, 64 >> shift);
10038 fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
10039 jmp(L_exit);
10040
10041 bind(L_fill_128_bytes);
10042 cmpq(count, 128 >> shift);
10043 jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
10044 fill64(to, 0, xtmp);
10045 fill32(to, 64, xtmp);
10046 subq(count, 96 >> shift);
10047 fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
10048 jmp(L_exit);
10049
10050 bind(L_fill_128_bytes_loop_pre_header);
10051 {
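    // Align `to` up to a 32-byte boundary: store the (32 - misalignment) leading bytes
    // with a byte-masked store, then advance the destination and reduce the remaining
    // count by the corresponding number of elements.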
10052 mov(rtmp, to);
10053 andq(rtmp, 31);
10054 jccb(Assembler::zero, L_fill_128_bytes_loop_header);
10055 negq(rtmp);
10056 addq(rtmp, 32);
10057 mov64(r8, -1L);
10058 bzhiq(r8, r8, rtmp);
10059 kmovql(k2, r8);
10060 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit);
10061 addq(to, rtmp);
10062 shrq(rtmp, shift);
10063 subq(count, rtmp);
10064 }
10065
10066 cmpq(count, 128 >> shift);
10067 jcc(Assembler::less, L_fill_start);
10068
10069 bind(L_fill_128_bytes_loop_header);
10070 subq(count, 128 >> shift);
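  // Pre-subtract one full iteration so the loop below can test the remaining count with
  // jcc(greaterEqual); the amount is added back after the loop to recover the tail count.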
10071
10072 align32();
10073 bind(L_fill_128_bytes_loop);
10074 fill64(to, 0, xtmp);
10075 fill64(to, 64, xtmp);
10076 addq(to, 128);
10077 subq(count, 128 >> shift);
10078 jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
10079
10080 addq(count, 128 >> shift);
10081 jcc(Assembler::zero, L_exit);
10082 jmp(L_fill_start);
10083 }
10084
10085 if (MaxVectorSize == 64) {
10086 // Sequence using 64 byte ZMM register.
10087 Label L_fill_128_bytes_zmm;
10088 Label L_fill_192_bytes_zmm;
10089 Label L_fill_192_bytes_loop_zmm;
10090 Label L_fill_192_bytes_loop_header_zmm;
10091 Label L_fill_192_bytes_loop_pre_header_zmm;
10092 Label L_fill_start_zmm_sequence;
10093
10094 bind(L_fill_zmm_sequence);
10095 evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
10096
10097 bind(L_fill_start_zmm_sequence);
10098 cmpq(count, 64 >> shift);
10099 jccb(Assembler::greater, L_fill_128_bytes_zmm);
10100 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
10101 jmp(L_exit);
10102
10103 bind(L_fill_128_bytes_zmm);
10104 cmpq(count, 128 >> shift);
10105 jccb(Assembler::greater, L_fill_192_bytes_zmm);
10106 fill64(to, 0, xtmp, true);
10107 subq(count, 64 >> shift);
10108 fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
10109 jmp(L_exit);
10110
10111 bind(L_fill_192_bytes_zmm);
10112 cmpq(count, 192 >> shift);
10113 jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
10114 fill64(to, 0, xtmp, true);
10115 fill64(to, 64, xtmp, true);
10116 subq(count, 128 >> shift);
10117 fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
10118 jmp(L_exit);
10119
10120 bind(L_fill_192_bytes_loop_pre_header_zmm);
10121 {
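      // Same alignment step as in the YMM sequence above, but to a 64-byte boundary.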
10122 movq(rtmp, to);
10123 andq(rtmp, 63);
10124 jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
10125 negq(rtmp);
10126 addq(rtmp, 64);
10127 mov64(r8, -1L);
10128 bzhiq(r8, r8, rtmp);
10129 kmovql(k2, r8);
10130 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit);
10131 addq(to, rtmp);
10132 shrq(rtmp, shift);
10133 subq(count, rtmp);
10134 }
10135
10136 cmpq(count, 192 >> shift);
10137 jcc(Assembler::less, L_fill_start_zmm_sequence);
10138
10139 bind(L_fill_192_bytes_loop_header_zmm);
10140 subq(count, 192 >> shift);
10141
10142 align32();
10143 bind(L_fill_192_bytes_loop_zmm);
10144 fill64(to, 0, xtmp, true);
10145 fill64(to, 64, xtmp, true);
10146 fill64(to, 128, xtmp, true);
10147 addq(to, 192);
10148 subq(count, 192 >> shift);
10149 jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
10150
10151 addq(count, 192 >> shift);
10152 jcc(Assembler::zero, L_exit);
10153 jmp(L_fill_start_zmm_sequence);
10154 }
10155 bind(L_exit);
10156 }
10157 #endif //COMPILER2_OR_JVMCI
10158
10159
10160 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
10161 Label done;
10162 cvttss2sil(dst, src);
10163 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
10164 cmpl(dst, 0x80000000); // float_sign_flip
10165 jccb(Assembler::notEqual, done);
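  // dst matched the sign-flip pattern, so the conversion may have overflowed or seen NaN.
  // Pass the original float to the fixup stub on the stack; the stub leaves the corrected
  // result in the same slot, which is then popped into dst.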
10166 subptr(rsp, 8);
10167 movflt(Address(rsp, 0), src);
10168 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
10169 pop(dst);
10170 bind(done);
10171 }
10172
10173 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
10174 Label done;
10175 cvttsd2sil(dst, src);
10176 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
10177 cmpl(dst, 0x80000000); // float_sign_flip
10178 jccb(Assembler::notEqual, done);
10179 subptr(rsp, 8);
10180 movdbl(Address(rsp, 0), src);
10181 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
10182 pop(dst);
10183 bind(done);
10184 }
10185
10186 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
10187 Label done;
10188 cvttss2siq(dst, src);
10189 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
10190 jccb(Assembler::notEqual, done);
10191 subptr(rsp, 8);
10192 movflt(Address(rsp, 0), src);
10193 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
10194 pop(dst);
10195 bind(done);
10196 }
10197
10198 void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  // The following code is a line-by-line assembly translation of the Java rounding algorithm.
  // Please refer to the java.lang.Math.round(float) algorithm for details.
10201 const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
10202 const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
10203 const int32_t FloatConsts_EXP_BIAS = 127;
10204 const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
10205 const int32_t MINUS_32 = 0xFFFFFFE0;
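  // Roughly, the Java algorithm being translated is:
  //   bits  = floatToRawIntBits(f);
  //   shift = (SIGNIFICAND_WIDTH - 2 + EXP_BIAS) - ((bits & EXP_BIT_MASK) >> (SIGNIFICAND_WIDTH - 1));
  //   if ((shift & -32) != 0) return (int) f;                // NaN, infinity, very large or very small f
  //   r = (bits & SIGNIF_BIT_MASK) | (SIGNIF_BIT_MASK + 1);  // significand with implicit bit
  //   if (bits < 0) r = -r;
  //   return ((r >> shift) + 1) >> 1;                        // shift into place, then round half up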
10206 Label L_special_case, L_block1, L_exit;
10207 movl(rtmp, FloatConsts_EXP_BIT_MASK);
10208 movdl(dst, src);
10209 andl(dst, rtmp);
10210 sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
10211 movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
10212 subl(rtmp, dst);
10213 movl(rcx, rtmp);
10214 movl(dst, MINUS_32);
10215 testl(rtmp, dst);
10216 jccb(Assembler::notEqual, L_special_case);
10217 movdl(dst, src);
10218 andl(dst, FloatConsts_SIGNIF_BIT_MASK);
10219 orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
10220 movdl(rtmp, src);
10221 testl(rtmp, rtmp);
10222 jccb(Assembler::greaterEqual, L_block1);
10223 negl(dst);
10224 bind(L_block1);
10225 sarl(dst);
10226 addl(dst, 0x1);
10227 sarl(dst, 0x1);
10228 jmp(L_exit);
10229 bind(L_special_case);
10230 convert_f2i(dst, src);
10231 bind(L_exit);
10232 }
10233
10234 void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  // The following code is a line-by-line assembly translation of the Java rounding algorithm.
  // Please refer to the java.lang.Math.round(double) algorithm for details.
10237 const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
10238 const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
10239 const int64_t DoubleConsts_EXP_BIAS = 1023;
10240 const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
10241 const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
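  // Same structure as round_float above, but with 64-bit constants and a valid shift range of [0, 63].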
10242 Label L_special_case, L_block1, L_exit;
10243 mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
10244 movq(dst, src);
10245 andq(dst, rtmp);
10246 sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
10247 mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
10248 subq(rtmp, dst);
10249 movq(rcx, rtmp);
10250 mov64(dst, MINUS_64);
10251 testq(rtmp, dst);
10252 jccb(Assembler::notEqual, L_special_case);
10253 movq(dst, src);
10254 mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
10255 andq(dst, rtmp);
10256 mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
10257 orq(dst, rtmp);
10258 movq(rtmp, src);
10259 testq(rtmp, rtmp);
10260 jccb(Assembler::greaterEqual, L_block1);
10261 negq(dst);
10262 bind(L_block1);
10263 sarq(dst);
10264 addq(dst, 0x1);
10265 sarq(dst, 0x1);
10266 jmp(L_exit);
10267 bind(L_special_case);
10268 convert_d2l(dst, src);
10269 bind(L_exit);
10270 }
10271
10272 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
10273 Label done;
10274 cvttsd2siq(dst, src);
10275 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
10276 jccb(Assembler::notEqual, done);
10277 subptr(rsp, 8);
10278 movdbl(Address(rsp, 0), src);
10279 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
10280 pop(dst);
10281 bind(done);
10282 }
10283
10284 void MacroAssembler::cache_wb(Address line)
10285 {
  // 64-bit CPUs always support clflush.
10287 assert(VM_Version::supports_clflush(), "clflush should be available");
10288 bool optimized = VM_Version::supports_clflushopt();
10289 bool no_evict = VM_Version::supports_clwb();
10290
  // Prefer clwb (writeback without evict); otherwise prefer clflushopt
  // (potentially parallel writeback with evict); otherwise fall back on
  // clflush (serial writeback with evict).
10294
10295 if (optimized) {
10296 if (no_evict) {
10297 clwb(line);
10298 } else {
10299 clflushopt(line);
10300 }
10301 } else {
10302 // no need for fence when using CLFLUSH
10303 clflush(line);
10304 }
10305 }
10306
10307 void MacroAssembler::cache_wbsync(bool is_pre)
10308 {
10309 assert(VM_Version::supports_clflush(), "clflush should be available");
10310 bool optimized = VM_Version::supports_clflushopt();
10311 bool no_evict = VM_Version::supports_clwb();
10312
10313 // pick the correct implementation
10314
10315 if (!is_pre && (optimized || no_evict)) {
    // An sfence is needed after the flushes (post sync) when using clflushopt or clwb;
    // otherwise no synchronization is needed.
10318
10319 sfence();
10320 }
10321 }
10322
10323 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
10324 switch (cond) {
10325 // Note some conditions are synonyms for others
10326 case Assembler::zero: return Assembler::notZero;
10327 case Assembler::notZero: return Assembler::zero;
10328 case Assembler::less: return Assembler::greaterEqual;
10329 case Assembler::lessEqual: return Assembler::greater;
10330 case Assembler::greater: return Assembler::lessEqual;
10331 case Assembler::greaterEqual: return Assembler::less;
10332 case Assembler::below: return Assembler::aboveEqual;
10333 case Assembler::belowEqual: return Assembler::above;
10334 case Assembler::above: return Assembler::belowEqual;
10335 case Assembler::aboveEqual: return Assembler::below;
10336 case Assembler::overflow: return Assembler::noOverflow;
10337 case Assembler::noOverflow: return Assembler::overflow;
10338 case Assembler::negative: return Assembler::positive;
10339 case Assembler::positive: return Assembler::negative;
10340 case Assembler::parity: return Assembler::noParity;
10341 case Assembler::noParity: return Assembler::parity;
10342 }
10343 ShouldNotReachHere(); return Assembler::overflow;
10344 }
10345
10346 // This is simply a call to Thread::current()
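// Caller-saved argument and scratch registers are saved and restored around the call,
// so only `thread` is modified.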
10347 void MacroAssembler::get_thread_slow(Register thread) {
10348 if (thread != rax) {
10349 push(rax);
10350 }
10351 push(rdi);
10352 push(rsi);
10353 push(rdx);
10354 push(rcx);
10355 push(r8);
10356 push(r9);
10357 push(r10);
10358 push(r11);
10359
10360 MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
10361
10362 pop(r11);
10363 pop(r10);
10364 pop(r9);
10365 pop(r8);
10366 pop(rcx);
10367 pop(rdx);
10368 pop(rsi);
10369 pop(rdi);
10370 if (thread != rax) {
10371 mov(thread, rax);
10372 pop(rax);
10373 }
10374 }
10375
10376 void MacroAssembler::check_stack_alignment(Register sp, const char* msg, unsigned bias, Register tmp) {
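  // Verify that sp (plus an optional bias) is 16-byte aligned; stop with `msg` if it is not.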
10377 Label L_stack_ok;
10378 if (bias == 0) {
10379 testptr(sp, 2 * wordSize - 1);
10380 } else {
    // lea(tmp, Address(rsp, bias));
10382 mov(tmp, sp);
10383 addptr(tmp, bias);
10384 testptr(tmp, 2 * wordSize - 1);
10385 }
10386 jcc(Assembler::equal, L_stack_ok);
10387 block_comment(msg);
10388 stop(msg);
10389 bind(L_stack_ok);
10390 }
10391
// Implements lightweight-locking.
//
// basic_lock: the BasicObjectLock of the lock site (its cached monitor pointer is cleared when UseObjectMonitorTable is enabled)
// obj: the object to be locked
// reg_rax: must be rax; clobbered
// tmp: a temporary register
// slow: label branched to when the lock must be acquired via the runtime slow path
10398 void MacroAssembler::lightweight_lock(Register basic_lock, Register obj, Register reg_rax, Register tmp, Label& slow) {
10399 Register thread = r15_thread;
10400
10401 assert(reg_rax == rax, "");
10402 assert_different_registers(basic_lock, obj, reg_rax, thread, tmp);
10403
10404 Label push;
10405 const Register top = tmp;
10406
10407 // Preload the markWord. It is important that this is the first
10408 // instruction emitted as it is part of C1's null check semantics.
10409 movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
10410
10411 if (UseObjectMonitorTable) {
10412 // Clear cache in case fast locking succeeds or we need to take the slow-path.
10413 movptr(Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))), 0);
10414 }
10415
10416 if (DiagnoseSyncOnValueBasedClasses != 0) {
10417 load_klass(tmp, obj, rscratch1);
10418 testb(Address(tmp, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
10419 jcc(Assembler::notZero, slow);
10420 }
10421
10422 // Load top.
10423 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
10424
10425 // Check if the lock-stack is full.
10426 cmpl(top, LockStack::end_offset());
10427 jcc(Assembler::greaterEqual, slow);
10428
10429 // Check for recursion.
10430 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
10431 jcc(Assembler::equal, push);
10432
10433 // Check header for monitor (0b10).
10434 testptr(reg_rax, markWord::monitor_value);
10435 jcc(Assembler::notZero, slow);
10436
10437 // Try to lock. Transition lock bits 0b01 => 0b00
10438 movptr(tmp, reg_rax);
10439 andptr(tmp, ~(int32_t)markWord::unlocked_value);
10440 orptr(reg_rax, markWord::unlocked_value);
10441 // Mask inline_type bit such that we go to the slow path if object is an inline type
10442 andptr(reg_rax, ~((int) markWord::inline_type_bit_in_place));
10443
10444 lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
10445 jcc(Assembler::notEqual, slow);
10446
10447 // Restore top, CAS clobbers register.
10448 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
10449
10450 bind(push);
10451 // After successful lock, push object on lock-stack.
10452 movptr(Address(thread, top), obj);
10453 incrementl(top, oopSize);
10454 movl(Address(thread, JavaThread::lock_stack_top_offset()), top);
10455 }
10456
// Implements lightweight-unlocking.
//
// obj: the object to be unlocked
// reg_rax: must be rax; clobbered
// tmp: a temporary register
// slow: label branched to when the unlock must be handled by the runtime slow path
10463 void MacroAssembler::lightweight_unlock(Register obj, Register reg_rax, Register tmp, Label& slow) {
10464 Register thread = r15_thread;
10465
10466 assert(reg_rax == rax, "");
10467 assert_different_registers(obj, reg_rax, thread, tmp);
10468
10469 Label unlocked, push_and_slow;
10470 const Register top = tmp;
10471
10472 // Check if obj is top of lock-stack.
10473 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
10474 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
10475 jcc(Assembler::notEqual, slow);
10476
10477 // Pop lock-stack.
10478 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
10479 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
10480
10481 // Check if recursive.
10482 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
10483 jcc(Assembler::equal, unlocked);
10484
10485 // Not recursive. Check header for monitor (0b10).
10486 movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
10487 testptr(reg_rax, markWord::monitor_value);
10488 jcc(Assembler::notZero, push_and_slow);
10489
10490 #ifdef ASSERT
10491 // Check header not unlocked (0b01).
10492 Label not_unlocked;
10493 testptr(reg_rax, markWord::unlocked_value);
10494 jcc(Assembler::zero, not_unlocked);
10495 stop("lightweight_unlock already unlocked");
10496 bind(not_unlocked);
10497 #endif
10498
10499 // Try to unlock. Transition lock bits 0b00 => 0b01
10500 movptr(tmp, reg_rax);
10501 orptr(tmp, markWord::unlocked_value);
10502 lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
10503 jcc(Assembler::equal, unlocked);
10504
10505 bind(push_and_slow);
10506 // Restore lock-stack and handle the unlock in runtime.
10507 #ifdef ASSERT
10508 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
10509 movptr(Address(thread, top), obj);
10510 #endif
10511 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
10512 jmp(slow);
10513
10514 bind(unlocked);
10515 }
10516
// Saves the legacy GPR state on the stack.
10518 void MacroAssembler::save_legacy_gprs() {
10519 subq(rsp, 16 * wordSize);
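  // Note: the slot at 11 * wordSize is left unused (rsp itself is not saved);
  // the layout matches restore_legacy_gprs() below.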
10520 movq(Address(rsp, 15 * wordSize), rax);
10521 movq(Address(rsp, 14 * wordSize), rcx);
10522 movq(Address(rsp, 13 * wordSize), rdx);
10523 movq(Address(rsp, 12 * wordSize), rbx);
10524 movq(Address(rsp, 10 * wordSize), rbp);
10525 movq(Address(rsp, 9 * wordSize), rsi);
10526 movq(Address(rsp, 8 * wordSize), rdi);
10527 movq(Address(rsp, 7 * wordSize), r8);
10528 movq(Address(rsp, 6 * wordSize), r9);
10529 movq(Address(rsp, 5 * wordSize), r10);
10530 movq(Address(rsp, 4 * wordSize), r11);
10531 movq(Address(rsp, 3 * wordSize), r12);
10532 movq(Address(rsp, 2 * wordSize), r13);
10533 movq(Address(rsp, wordSize), r14);
10534 movq(Address(rsp, 0), r15);
10535 }
10536
// Restores the legacy GPR state from the stack.
10538 void MacroAssembler::restore_legacy_gprs() {
10539 movq(r15, Address(rsp, 0));
10540 movq(r14, Address(rsp, wordSize));
10541 movq(r13, Address(rsp, 2 * wordSize));
10542 movq(r12, Address(rsp, 3 * wordSize));
10543 movq(r11, Address(rsp, 4 * wordSize));
10544 movq(r10, Address(rsp, 5 * wordSize));
10545 movq(r9, Address(rsp, 6 * wordSize));
10546 movq(r8, Address(rsp, 7 * wordSize));
10547 movq(rdi, Address(rsp, 8 * wordSize));
10548 movq(rsi, Address(rsp, 9 * wordSize));
10549 movq(rbp, Address(rsp, 10 * wordSize));
10550 movq(rbx, Address(rsp, 12 * wordSize));
10551 movq(rdx, Address(rsp, 13 * wordSize));
10552 movq(rcx, Address(rsp, 14 * wordSize));
10553 movq(rax, Address(rsp, 15 * wordSize));
10554 addq(rsp, 16 * wordSize);
10555 }
10556
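// Materialize `comparison` as a zero-extended 0/1 value in dst. With APX, esetzucc
// zero-extends the destination itself; otherwise setb writes only the low byte, so an
// explicit movzbl is needed.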
10557 void MacroAssembler::setcc(Assembler::Condition comparison, Register dst) {
10558 if (VM_Version::supports_apx_f()) {
10559 esetzucc(comparison, dst);
10560 } else {
10561 setb(comparison, dst);
10562 movzbl(dst, dst);
10563 }
10564 }