1 /*
2 * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "code/aotCodeCache.hpp"
28 #include "code/compiledIC.hpp"
29 #include "compiler/compiler_globals.hpp"
30 #include "compiler/disassembler.hpp"
31 #include "crc32c.h"
32 #include "gc/shared/barrierSet.hpp"
33 #include "gc/shared/barrierSetAssembler.hpp"
34 #include "gc/shared/collectedHeap.inline.hpp"
35 #include "gc/shared/tlab_globals.hpp"
36 #include "interpreter/bytecodeHistogram.hpp"
37 #include "interpreter/interpreter.hpp"
38 #include "interpreter/interpreterRuntime.hpp"
39 #include "jvm.h"
40 #include "memory/resourceArea.hpp"
41 #include "memory/universe.hpp"
42 #include "oops/accessDecorators.hpp"
43 #include "oops/compressedKlass.inline.hpp"
44 #include "oops/compressedOops.inline.hpp"
45 #include "oops/klass.inline.hpp"
46 #include "prims/methodHandles.hpp"
47 #include "runtime/continuation.hpp"
48 #include "runtime/interfaceSupport.inline.hpp"
49 #include "runtime/javaThread.hpp"
50 #include "runtime/jniHandles.hpp"
51 #include "runtime/objectMonitor.hpp"
52 #include "runtime/os.hpp"
53 #include "runtime/safepoint.hpp"
54 #include "runtime/safepointMechanism.hpp"
55 #include "runtime/sharedRuntime.hpp"
56 #include "runtime/stubRoutines.hpp"
57 #include "utilities/checkedCast.hpp"
58 #include "utilities/globalDefinitions.hpp"
59 #include "utilities/macros.hpp"
60
61 #ifdef PRODUCT
62 #define BLOCK_COMMENT(str) /* nothing */
63 #define STOP(error) stop(error)
64 #else
65 #define BLOCK_COMMENT(str) block_comment(str)
66 #define STOP(error) block_comment(error); stop(error)
67 #endif
68
69 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
70
71 #ifdef ASSERT
72 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
73 #endif
74
75 static const Assembler::Condition reverse[] = {
76 Assembler::noOverflow /* overflow = 0x0 */ ,
77 Assembler::overflow /* noOverflow = 0x1 */ ,
78 Assembler::aboveEqual /* carrySet = 0x2, below = 0x2 */ ,
79 Assembler::below /* aboveEqual = 0x3, carryClear = 0x3 */ ,
80 Assembler::notZero /* zero = 0x4, equal = 0x4 */ ,
81 Assembler::zero /* notZero = 0x5, notEqual = 0x5 */ ,
82 Assembler::above /* belowEqual = 0x6 */ ,
83 Assembler::belowEqual /* above = 0x7 */ ,
84 Assembler::positive /* negative = 0x8 */ ,
85 Assembler::negative /* positive = 0x9 */ ,
86 Assembler::noParity /* parity = 0xa */ ,
87 Assembler::parity /* noParity = 0xb */ ,
88 Assembler::greaterEqual /* less = 0xc */ ,
89 Assembler::less /* greaterEqual = 0xd */ ,
90 Assembler::greater /* lessEqual = 0xe */ ,
91 Assembler::lessEqual /* greater = 0xf, */
92
93 };
94
95
96 // Implementation of MacroAssembler
97
98 Address MacroAssembler::as_Address(AddressLiteral adr) {
99 // amd64 always does this as a pc-rel
100 // we can be absolute or disp based on the instruction type
101 // jmp/call are displacements others are absolute
102 assert(!adr.is_lval(), "must be rval");
103 assert(reachable(adr), "must be");
104 return Address(checked_cast<int32_t>(adr.target() - pc()), adr.target(), adr.reloc());
105
106 }
107
108 Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
109 AddressLiteral base = adr.base();
110 lea(rscratch, base);
111 Address index = adr.index();
112 assert(index._disp == 0, "must not have disp"); // maybe it can?
113 Address array(rscratch, index._index, index._scale, index._disp);
114 return array;
115 }
116
117 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
118 Label L, E;
119
120 #ifdef _WIN64
121 // Windows always allocates space for it's register args
122 assert(num_args <= 4, "only register arguments supported");
123 subq(rsp, frame::arg_reg_save_area_bytes);
124 #endif
125
126 // Align stack if necessary
127 testl(rsp, 15);
128 jcc(Assembler::zero, L);
129
130 subq(rsp, 8);
131 call(RuntimeAddress(entry_point));
132 addq(rsp, 8);
133 jmp(E);
134
135 bind(L);
136 call(RuntimeAddress(entry_point));
137
138 bind(E);
139
140 #ifdef _WIN64
141 // restore stack pointer
142 addq(rsp, frame::arg_reg_save_area_bytes);
143 #endif
144 }
145
146 void MacroAssembler::cmp64(Register src1, AddressLiteral src2, Register rscratch) {
147 assert(!src2.is_lval(), "should use cmpptr");
148 assert(rscratch != noreg || always_reachable(src2), "missing");
149
150 if (reachable(src2)) {
151 cmpq(src1, as_Address(src2));
152 } else {
153 lea(rscratch, src2);
154 Assembler::cmpq(src1, Address(rscratch, 0));
155 }
156 }
157
158 int MacroAssembler::corrected_idivq(Register reg) {
159 // Full implementation of Java ldiv and lrem; checks for special
160 // case as described in JVM spec., p.243 & p.271. The function
161 // returns the (pc) offset of the idivl instruction - may be needed
162 // for implicit exceptions.
163 //
164 // normal case special case
165 //
166 // input : rax: dividend min_long
167 // reg: divisor (may not be eax/edx) -1
168 //
169 // output: rax: quotient (= rax idiv reg) min_long
170 // rdx: remainder (= rax irem reg) 0
171 assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
172 static const int64_t min_long = 0x8000000000000000;
173 Label normal_case, special_case;
174
175 // check for special case
176 cmp64(rax, ExternalAddress((address) &min_long), rdx /*rscratch*/);
177 jcc(Assembler::notEqual, normal_case);
178 xorl(rdx, rdx); // prepare rdx for possible special case (where
179 // remainder = 0)
180 cmpq(reg, -1);
181 jcc(Assembler::equal, special_case);
182
183 // handle normal case
184 bind(normal_case);
185 cdqq();
186 int idivq_offset = offset();
187 idivq(reg);
188
189 // normal and special case exit
190 bind(special_case);
191
192 return idivq_offset;
193 }
194
195 void MacroAssembler::decrementq(Register reg, int value) {
196 if (value == min_jint) { subq(reg, value); return; }
197 if (value < 0) { incrementq(reg, -value); return; }
198 if (value == 0) { ; return; }
199 if (value == 1 && UseIncDec) { decq(reg) ; return; }
200 /* else */ { subq(reg, value) ; return; }
201 }
202
203 void MacroAssembler::decrementq(Address dst, int value) {
204 if (value == min_jint) { subq(dst, value); return; }
205 if (value < 0) { incrementq(dst, -value); return; }
206 if (value == 0) { ; return; }
207 if (value == 1 && UseIncDec) { decq(dst) ; return; }
208 /* else */ { subq(dst, value) ; return; }
209 }
210
211 void MacroAssembler::incrementq(AddressLiteral dst, Register rscratch) {
212 assert(rscratch != noreg || always_reachable(dst), "missing");
213
214 if (reachable(dst)) {
215 incrementq(as_Address(dst));
216 } else {
217 lea(rscratch, dst);
218 incrementq(Address(rscratch, 0));
219 }
220 }
221
222 void MacroAssembler::incrementq(Register reg, int value) {
223 if (value == min_jint) { addq(reg, value); return; }
224 if (value < 0) { decrementq(reg, -value); return; }
225 if (value == 0) { ; return; }
226 if (value == 1 && UseIncDec) { incq(reg) ; return; }
227 /* else */ { addq(reg, value) ; return; }
228 }
229
230 void MacroAssembler::incrementq(Address dst, int value) {
231 if (value == min_jint) { addq(dst, value); return; }
232 if (value < 0) { decrementq(dst, -value); return; }
233 if (value == 0) { ; return; }
234 if (value == 1 && UseIncDec) { incq(dst) ; return; }
235 /* else */ { addq(dst, value) ; return; }
236 }
237
238 // 32bit can do a case table jump in one instruction but we no longer allow the base
239 // to be installed in the Address class
240 void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
241 lea(rscratch, entry.base());
242 Address dispatch = entry.index();
243 assert(dispatch._base == noreg, "must be");
244 dispatch._base = rscratch;
245 jmp(dispatch);
246 }
247
248 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
249 ShouldNotReachHere(); // 64bit doesn't use two regs
250 cmpq(x_lo, y_lo);
251 }
252
253 void MacroAssembler::lea(Register dst, AddressLiteral src) {
254 mov_literal64(dst, (intptr_t)src.target(), src.rspec());
255 }
256
257 void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
258 lea(rscratch, adr);
259 movptr(dst, rscratch);
260 }
261
262 void MacroAssembler::leave() {
263 // %%% is this really better? Why not on 32bit too?
264 emit_int8((unsigned char)0xC9); // LEAVE
265 }
266
267 void MacroAssembler::lneg(Register hi, Register lo) {
268 ShouldNotReachHere(); // 64bit doesn't use two regs
269 negq(lo);
270 }
271
272 void MacroAssembler::movoop(Register dst, jobject obj) {
273 mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
274 }
275
276 void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
277 mov_literal64(rscratch, (intptr_t)obj, oop_Relocation::spec_for_immediate());
278 movq(dst, rscratch);
279 }
280
281 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
282 mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
283 }
284
285 void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
286 mov_literal64(rscratch, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
287 movq(dst, rscratch);
288 }
289
290 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
291 if (src.is_lval()) {
292 mov_literal64(dst, (intptr_t)src.target(), src.rspec());
293 } else {
294 if (reachable(src)) {
295 movq(dst, as_Address(src));
296 } else {
297 lea(dst, src);
298 movq(dst, Address(dst, 0));
299 }
300 }
301 }
302
303 void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
304 movq(as_Address(dst, rscratch), src);
305 }
306
307 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
308 movq(dst, as_Address(src, dst /*rscratch*/));
309 }
310
311 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
312 void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
313 if (is_simm32(src)) {
314 movptr(dst, checked_cast<int32_t>(src));
315 } else {
316 mov64(rscratch, src);
317 movq(dst, rscratch);
318 }
319 }
320
321 void MacroAssembler::pushoop(jobject obj, Register rscratch) {
322 movoop(rscratch, obj);
323 push(rscratch);
324 }
325
326 void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
327 mov_metadata(rscratch, obj);
328 push(rscratch);
329 }
330
331 void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
332 lea(rscratch, src);
333 if (src.is_lval()) {
334 push(rscratch);
335 } else {
336 pushq(Address(rscratch, 0));
337 }
338 }
339
340 static void pass_arg0(MacroAssembler* masm, Register arg) {
341 if (c_rarg0 != arg ) {
342 masm->mov(c_rarg0, arg);
343 }
344 }
345
346 static void pass_arg1(MacroAssembler* masm, Register arg) {
347 if (c_rarg1 != arg ) {
348 masm->mov(c_rarg1, arg);
349 }
350 }
351
352 static void pass_arg2(MacroAssembler* masm, Register arg) {
353 if (c_rarg2 != arg ) {
354 masm->mov(c_rarg2, arg);
355 }
356 }
357
358 static void pass_arg3(MacroAssembler* masm, Register arg) {
359 if (c_rarg3 != arg ) {
360 masm->mov(c_rarg3, arg);
361 }
362 }
363
364 void MacroAssembler::stop(const char* msg) {
365 if (ShowMessageBoxOnError) {
366 address rip = pc();
367 pusha(); // get regs on stack
368 lea(c_rarg1, InternalAddress(rip));
369 movq(c_rarg2, rsp); // pass pointer to regs array
370 }
371 // Skip AOT caching C strings in scratch buffer.
372 const char* str = (code_section()->scratch_emit()) ? msg : AOTCodeCache::add_C_string(msg);
373 lea(c_rarg0, ExternalAddress((address) str));
374 andq(rsp, -16); // align stack as required by ABI
375 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
376 hlt();
377 }
378
379 void MacroAssembler::warn(const char* msg) {
380 push(rbp);
381 movq(rbp, rsp);
382 andq(rsp, -16); // align stack as required by push_CPU_state and call
383 push_CPU_state(); // keeps alignment at 16 bytes
384
385 #ifdef _WIN64
386 // Windows always allocates space for its register args
387 subq(rsp, frame::arg_reg_save_area_bytes);
388 #endif
389 const char* str = (code_section()->scratch_emit()) ? msg : AOTCodeCache::add_C_string(msg);
390 lea(c_rarg0, ExternalAddress((address) str));
391 call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
392
393 #ifdef _WIN64
394 // restore stack pointer
395 addq(rsp, frame::arg_reg_save_area_bytes);
396 #endif
397 pop_CPU_state();
398 mov(rsp, rbp);
399 pop(rbp);
400 }
401
402 void MacroAssembler::print_state() {
403 address rip = pc();
404 pusha(); // get regs on stack
405 push(rbp);
406 movq(rbp, rsp);
407 andq(rsp, -16); // align stack as required by push_CPU_state and call
408 push_CPU_state(); // keeps alignment at 16 bytes
409
410 lea(c_rarg0, InternalAddress(rip));
411 lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
412 call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
413
414 pop_CPU_state();
415 mov(rsp, rbp);
416 pop(rbp);
417 popa();
418 }
419
420 #ifndef PRODUCT
421 extern "C" void findpc(intptr_t x);
422 #endif
423
424 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
425 // In order to get locks to work, we need to fake a in_VM state
426 if (ShowMessageBoxOnError) {
427 JavaThread* thread = JavaThread::current();
428 JavaThreadState saved_state = thread->thread_state();
429 thread->set_thread_state(_thread_in_vm);
430 #ifndef PRODUCT
431 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
432 ttyLocker ttyl;
433 BytecodeCounter::print();
434 }
435 #endif
436 // To see where a verify_oop failed, get $ebx+40/X for this frame.
437 // XXX correct this offset for amd64
438 // This is the value of eip which points to where verify_oop will return.
439 if (os::message_box(msg, "Execution stopped, print registers?")) {
440 print_state64(pc, regs);
441 BREAKPOINT;
442 }
443 }
444 fatal("DEBUG MESSAGE: %s", msg);
445 }
446
447 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
448 ttyLocker ttyl;
449 DebuggingContext debugging{};
450 tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
451 #ifndef PRODUCT
452 tty->cr();
453 findpc(pc);
454 tty->cr();
455 #endif
456 #define PRINT_REG(rax, value) \
457 { tty->print("%s = ", #rax); os::print_location(tty, value); }
458 PRINT_REG(rax, regs[15]);
459 PRINT_REG(rbx, regs[12]);
460 PRINT_REG(rcx, regs[14]);
461 PRINT_REG(rdx, regs[13]);
462 PRINT_REG(rdi, regs[8]);
463 PRINT_REG(rsi, regs[9]);
464 PRINT_REG(rbp, regs[10]);
465 // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
466 PRINT_REG(rsp, (intptr_t)(®s[16]));
467 PRINT_REG(r8 , regs[7]);
468 PRINT_REG(r9 , regs[6]);
469 PRINT_REG(r10, regs[5]);
470 PRINT_REG(r11, regs[4]);
471 PRINT_REG(r12, regs[3]);
472 PRINT_REG(r13, regs[2]);
473 PRINT_REG(r14, regs[1]);
474 PRINT_REG(r15, regs[0]);
475 #undef PRINT_REG
476 // Print some words near the top of the stack.
477 int64_t* rsp = ®s[16];
478 int64_t* dump_sp = rsp;
479 for (int col1 = 0; col1 < 8; col1++) {
480 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
481 os::print_location(tty, *dump_sp++);
482 }
483 for (int row = 0; row < 25; row++) {
484 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
485 for (int col = 0; col < 4; col++) {
486 tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
487 }
488 tty->cr();
489 }
490 // Print some instructions around pc:
491 Disassembler::decode((address)pc-64, (address)pc);
492 tty->print_cr("--------");
493 Disassembler::decode((address)pc, (address)pc+32);
494 }
495
496 // The java_calling_convention describes stack locations as ideal slots on
497 // a frame with no abi restrictions. Since we must observe abi restrictions
498 // (like the placement of the register window) the slots must be biased by
499 // the following value.
500 static int reg2offset_in(VMReg r) {
501 // Account for saved rbp and return address
502 // This should really be in_preserve_stack_slots
503 return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
504 }
505
506 static int reg2offset_out(VMReg r) {
507 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
508 }
509
510 // A long move
511 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
512
513 // The calling conventions assures us that each VMregpair is either
514 // all really one physical register or adjacent stack slots.
515
516 if (src.is_single_phys_reg() ) {
517 if (dst.is_single_phys_reg()) {
518 if (dst.first() != src.first()) {
519 mov(dst.first()->as_Register(), src.first()->as_Register());
520 }
521 } else {
522 assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
523 src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
524 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
525 }
526 } else if (dst.is_single_phys_reg()) {
527 assert(src.is_single_reg(), "not a stack pair");
528 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
529 } else {
530 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
531 movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
532 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
533 }
534 }
535
536 // A double move
537 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
538
539 // The calling conventions assures us that each VMregpair is either
540 // all really one physical register or adjacent stack slots.
541
542 if (src.is_single_phys_reg() ) {
543 if (dst.is_single_phys_reg()) {
544 // In theory these overlap but the ordering is such that this is likely a nop
545 if ( src.first() != dst.first()) {
546 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
547 }
548 } else {
549 assert(dst.is_single_reg(), "not a stack pair");
550 movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
551 }
552 } else if (dst.is_single_phys_reg()) {
553 assert(src.is_single_reg(), "not a stack pair");
554 movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
555 } else {
556 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
557 movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
558 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
559 }
560 }
561
562
563 // A float arg may have to do float reg int reg conversion
564 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
565 assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
566
567 // The calling conventions assures us that each VMregpair is either
568 // all really one physical register or adjacent stack slots.
569
570 if (src.first()->is_stack()) {
571 if (dst.first()->is_stack()) {
572 movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
573 movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
574 } else {
575 // stack to reg
576 assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
577 movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
578 }
579 } else if (dst.first()->is_stack()) {
580 // reg to stack
581 assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
582 movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
583 } else {
584 // reg to reg
585 // In theory these overlap but the ordering is such that this is likely a nop
586 if ( src.first() != dst.first()) {
587 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
588 }
589 }
590 }
591
592 // On 64 bit we will store integer like items to the stack as
593 // 64 bits items (x86_32/64 abi) even though java would only store
594 // 32bits for a parameter. On 32bit it will simply be 32 bits
595 // So this routine will do 32->32 on 32bit and 32->64 on 64bit
596 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
597 if (src.first()->is_stack()) {
598 if (dst.first()->is_stack()) {
599 // stack to stack
600 movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
601 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
602 } else {
603 // stack to reg
604 movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
605 }
606 } else if (dst.first()->is_stack()) {
607 // reg to stack
608 // Do we really have to sign extend???
609 // __ movslq(src.first()->as_Register(), src.first()->as_Register());
610 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
611 } else {
612 // Do we really have to sign extend???
613 // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
614 if (dst.first() != src.first()) {
615 movq(dst.first()->as_Register(), src.first()->as_Register());
616 }
617 }
618 }
619
620 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
621 if (src.first()->is_stack()) {
622 if (dst.first()->is_stack()) {
623 // stack to stack
624 movq(rax, Address(rbp, reg2offset_in(src.first())));
625 movq(Address(rsp, reg2offset_out(dst.first())), rax);
626 } else {
627 // stack to reg
628 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
629 }
630 } else if (dst.first()->is_stack()) {
631 // reg to stack
632 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
633 } else {
634 if (dst.first() != src.first()) {
635 movq(dst.first()->as_Register(), src.first()->as_Register());
636 }
637 }
638 }
639
640 // An oop arg. Must pass a handle not the oop itself
641 void MacroAssembler::object_move(OopMap* map,
642 int oop_handle_offset,
643 int framesize_in_slots,
644 VMRegPair src,
645 VMRegPair dst,
646 bool is_receiver,
647 int* receiver_offset) {
648
649 // must pass a handle. First figure out the location we use as a handle
650
651 Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
652
653 // See if oop is null if it is we need no handle
654
655 if (src.first()->is_stack()) {
656
657 // Oop is already on the stack as an argument
658 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
659 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
660 if (is_receiver) {
661 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
662 }
663
664 cmpptr(Address(rbp, reg2offset_in(src.first())), NULL_WORD);
665 lea(rHandle, Address(rbp, reg2offset_in(src.first())));
666 // conditionally move a null
667 cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
668 } else {
669
670 // Oop is in a register we must store it to the space we reserve
671 // on the stack for oop_handles and pass a handle if oop is non-null
672
673 const Register rOop = src.first()->as_Register();
674 int oop_slot;
675 if (rOop == j_rarg0)
676 oop_slot = 0;
677 else if (rOop == j_rarg1)
678 oop_slot = 1;
679 else if (rOop == j_rarg2)
680 oop_slot = 2;
681 else if (rOop == j_rarg3)
682 oop_slot = 3;
683 else if (rOop == j_rarg4)
684 oop_slot = 4;
685 else {
686 assert(rOop == j_rarg5, "wrong register");
687 oop_slot = 5;
688 }
689
690 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
691 int offset = oop_slot*VMRegImpl::stack_slot_size;
692
693 map->set_oop(VMRegImpl::stack2reg(oop_slot));
694 // Store oop in handle area, may be null
695 movptr(Address(rsp, offset), rOop);
696 if (is_receiver) {
697 *receiver_offset = offset;
698 }
699
700 cmpptr(rOop, NULL_WORD);
701 lea(rHandle, Address(rsp, offset));
702 // conditionally move a null from the handle area where it was just stored
703 cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
704 }
705
706 // If arg is on the stack then place it otherwise it is already in correct reg.
707 if (dst.first()->is_stack()) {
708 movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
709 }
710 }
711
712 void MacroAssembler::addptr(Register dst, int32_t imm32) {
713 addq(dst, imm32);
714 }
715
716 void MacroAssembler::addptr(Register dst, Register src) {
717 addq(dst, src);
718 }
719
720 void MacroAssembler::addptr(Address dst, Register src) {
721 addq(dst, src);
722 }
723
724 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
725 assert(rscratch != noreg || always_reachable(src), "missing");
726
727 if (reachable(src)) {
728 Assembler::addsd(dst, as_Address(src));
729 } else {
730 lea(rscratch, src);
731 Assembler::addsd(dst, Address(rscratch, 0));
732 }
733 }
734
735 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src, Register rscratch) {
736 assert(rscratch != noreg || always_reachable(src), "missing");
737
738 if (reachable(src)) {
739 addss(dst, as_Address(src));
740 } else {
741 lea(rscratch, src);
742 addss(dst, Address(rscratch, 0));
743 }
744 }
745
746 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
747 assert(rscratch != noreg || always_reachable(src), "missing");
748
749 if (reachable(src)) {
750 Assembler::addpd(dst, as_Address(src));
751 } else {
752 lea(rscratch, src);
753 Assembler::addpd(dst, Address(rscratch, 0));
754 }
755 }
756
757 // See 8273459. Function for ensuring 64-byte alignment, intended for stubs only.
758 // Stub code is generated once and never copied.
759 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
760 void MacroAssembler::align64() {
761 align(64, (uint)(uintptr_t)pc());
762 }
763
764 void MacroAssembler::align32() {
765 align(32, (uint)(uintptr_t)pc());
766 }
767
768 void MacroAssembler::align(uint modulus) {
769 // 8273459: Ensure alignment is possible with current segment alignment
770 assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
771 align(modulus, offset());
772 }
773
774 void MacroAssembler::align(uint modulus, uint target) {
775 if (target % modulus != 0) {
776 nop(modulus - (target % modulus));
777 }
778 }
779
780 void MacroAssembler::push_f(XMMRegister r) {
781 subptr(rsp, wordSize);
782 movflt(Address(rsp, 0), r);
783 }
784
785 void MacroAssembler::pop_f(XMMRegister r) {
786 movflt(r, Address(rsp, 0));
787 addptr(rsp, wordSize);
788 }
789
790 void MacroAssembler::push_d(XMMRegister r) {
791 subptr(rsp, 2 * wordSize);
792 movdbl(Address(rsp, 0), r);
793 }
794
795 void MacroAssembler::pop_d(XMMRegister r) {
796 movdbl(r, Address(rsp, 0));
797 addptr(rsp, 2 * Interpreter::stackElementSize);
798 }
799
800 void MacroAssembler::push_ppx(Register src) {
801 if (VM_Version::supports_apx_f()) {
802 pushp(src);
803 } else {
804 Assembler::push(src);
805 }
806 }
807
808 void MacroAssembler::pop_ppx(Register dst) {
809 if (VM_Version::supports_apx_f()) {
810 popp(dst);
811 } else {
812 Assembler::pop(dst);
813 }
814 }
815
816 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
817 // Used in sign-masking with aligned address.
818 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
819 assert(rscratch != noreg || always_reachable(src), "missing");
820
821 if (UseAVX > 2 &&
822 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
823 (dst->encoding() >= 16)) {
824 vpand(dst, dst, src, AVX_512bit, rscratch);
825 } else if (reachable(src)) {
826 Assembler::andpd(dst, as_Address(src));
827 } else {
828 lea(rscratch, src);
829 Assembler::andpd(dst, Address(rscratch, 0));
830 }
831 }
832
833 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register rscratch) {
834 // Used in sign-masking with aligned address.
835 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
836 assert(rscratch != noreg || always_reachable(src), "missing");
837
838 if (reachable(src)) {
839 Assembler::andps(dst, as_Address(src));
840 } else {
841 lea(rscratch, src);
842 Assembler::andps(dst, Address(rscratch, 0));
843 }
844 }
845
846 void MacroAssembler::andptr(Register dst, int32_t imm32) {
847 andq(dst, imm32);
848 }
849
850 void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) {
851 assert(rscratch != noreg || always_reachable(src), "missing");
852
853 if (reachable(src)) {
854 andq(dst, as_Address(src));
855 } else {
856 lea(rscratch, src);
857 andq(dst, Address(rscratch, 0));
858 }
859 }
860
861 void MacroAssembler::atomic_incl(Address counter_addr) {
862 lock();
863 incrementl(counter_addr);
864 }
865
866 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register rscratch) {
867 assert(rscratch != noreg || always_reachable(counter_addr), "missing");
868
869 if (reachable(counter_addr)) {
870 atomic_incl(as_Address(counter_addr));
871 } else {
872 lea(rscratch, counter_addr);
873 atomic_incl(Address(rscratch, 0));
874 }
875 }
876
877 void MacroAssembler::atomic_incq(Address counter_addr) {
878 lock();
879 incrementq(counter_addr);
880 }
881
882 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register rscratch) {
883 assert(rscratch != noreg || always_reachable(counter_addr), "missing");
884
885 if (reachable(counter_addr)) {
886 atomic_incq(as_Address(counter_addr));
887 } else {
888 lea(rscratch, counter_addr);
889 atomic_incq(Address(rscratch, 0));
890 }
891 }
892
893 // Writes to stack successive pages until offset reached to check for
894 // stack overflow + shadow pages. This clobbers tmp.
895 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
896 movptr(tmp, rsp);
897 // Bang stack for total size given plus shadow page size.
898 // Bang one page at a time because large size can bang beyond yellow and
899 // red zones.
900 Label loop;
901 bind(loop);
902 movl(Address(tmp, (-(int)os::vm_page_size())), size );
903 subptr(tmp, (int)os::vm_page_size());
904 subl(size, (int)os::vm_page_size());
905 jcc(Assembler::greater, loop);
906
907 // Bang down shadow pages too.
908 // At this point, (tmp-0) is the last address touched, so don't
909 // touch it again. (It was touched as (tmp-pagesize) but then tmp
910 // was post-decremented.) Skip this address by starting at i=1, and
911 // touch a few more pages below. N.B. It is important to touch all
912 // the way down including all pages in the shadow zone.
913 for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()); i++) {
914 // this could be any sized move but this is can be a debugging crumb
915 // so the bigger the better.
916 movptr(Address(tmp, (-i*(int)os::vm_page_size())), size );
917 }
918 }
919
920 void MacroAssembler::reserved_stack_check() {
921 // testing if reserved zone needs to be enabled
922 Label no_reserved_zone_enabling;
923
924 cmpptr(rsp, Address(r15_thread, JavaThread::reserved_stack_activation_offset()));
925 jcc(Assembler::below, no_reserved_zone_enabling);
926
927 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), r15_thread);
928 jump(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
929 should_not_reach_here();
930
931 bind(no_reserved_zone_enabling);
932 }
933
934 void MacroAssembler::c2bool(Register x) {
935 // implements x == 0 ? 0 : 1
936 // note: must only look at least-significant byte of x
937 // since C-style booleans are stored in one byte
938 // only! (was bug)
939 andl(x, 0xFF);
940 setb(Assembler::notZero, x);
941 }
942
943 // Wouldn't need if AddressLiteral version had new name
944 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
945 Assembler::call(L, rtype);
946 }
947
948 void MacroAssembler::call(Register entry) {
949 Assembler::call(entry);
950 }
951
952 void MacroAssembler::call(AddressLiteral entry, Register rscratch) {
953 assert(rscratch != noreg || always_reachable(entry), "missing");
954
955 if (reachable(entry)) {
956 Assembler::call_literal(entry.target(), entry.rspec());
957 } else {
958 lea(rscratch, entry);
959 Assembler::call(rscratch);
960 }
961 }
962
963 void MacroAssembler::ic_call(address entry, jint method_index) {
964 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
965 // Needs full 64-bit immediate for later patching.
966 Assembler::mov64(rax, (int64_t)Universe::non_oop_word());
967 call(AddressLiteral(entry, rh));
968 }
969
970 int MacroAssembler::ic_check_size() {
971 return UseCompactObjectHeaders ? 17 : 14;
972 }
973
974 int MacroAssembler::ic_check(int end_alignment) {
975 Register receiver = j_rarg0;
976 Register data = rax;
977 Register temp = rscratch1;
978
979 // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
980 // before the inline cache check, so we don't have to execute any nop instructions when dispatching
981 // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
982 // before the inline cache check here, and not after
983 align(end_alignment, offset() + ic_check_size());
984
985 int uep_offset = offset();
986
987 if (UseCompactObjectHeaders) {
988 load_narrow_klass_compact(temp, receiver);
989 cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
990 } else {
991 movl(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
992 cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
993 }
994
995 // if inline cache check fails, then jump to runtime routine
996 jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
997 assert((offset() % end_alignment) == 0, "Misaligned verified entry point (%d, %d, %d)", uep_offset, offset(), end_alignment);
998
999 return uep_offset;
1000 }
1001
1002 void MacroAssembler::emit_static_call_stub() {
1003 // Static stub relocation also tags the Method* in the code-stream.
1004 mov_metadata(rbx, (Metadata*) nullptr); // Method is zapped till fixup time.
1005 // This is recognized as unresolved by relocs/nativeinst/ic code.
1006 jump(RuntimeAddress(pc()));
1007 }
1008
1009 // Implementation of call_VM versions
1010
1011 void MacroAssembler::call_VM(Register oop_result,
1012 address entry_point,
1013 bool check_exceptions) {
1014 Label C, E;
1015 call(C, relocInfo::none);
1016 jmp(E);
1017
1018 bind(C);
1019 call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1020 ret(0);
1021
1022 bind(E);
1023 }
1024
1025 void MacroAssembler::call_VM(Register oop_result,
1026 address entry_point,
1027 Register arg_1,
1028 bool check_exceptions) {
1029 Label C, E;
1030 call(C, relocInfo::none);
1031 jmp(E);
1032
1033 bind(C);
1034 pass_arg1(this, arg_1);
1035 call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1036 ret(0);
1037
1038 bind(E);
1039 }
1040
1041 void MacroAssembler::call_VM(Register oop_result,
1042 address entry_point,
1043 Register arg_1,
1044 Register arg_2,
1045 bool check_exceptions) {
1046 Label C, E;
1047 call(C, relocInfo::none);
1048 jmp(E);
1049
1050 bind(C);
1051
1052 assert_different_registers(arg_1, c_rarg2);
1053
1054 pass_arg2(this, arg_2);
1055 pass_arg1(this, arg_1);
1056 call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1057 ret(0);
1058
1059 bind(E);
1060 }
1061
1062 void MacroAssembler::call_VM(Register oop_result,
1063 address entry_point,
1064 Register arg_1,
1065 Register arg_2,
1066 Register arg_3,
1067 bool check_exceptions) {
1068 Label C, E;
1069 call(C, relocInfo::none);
1070 jmp(E);
1071
1072 bind(C);
1073
1074 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1075 assert_different_registers(arg_2, c_rarg3);
1076 pass_arg3(this, arg_3);
1077 pass_arg2(this, arg_2);
1078 pass_arg1(this, arg_1);
1079 call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1080 ret(0);
1081
1082 bind(E);
1083 }
1084
1085 void MacroAssembler::call_VM(Register oop_result,
1086 Register last_java_sp,
1087 address entry_point,
1088 int number_of_arguments,
1089 bool check_exceptions) {
1090 call_VM_base(oop_result, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1091 }
1092
1093 void MacroAssembler::call_VM(Register oop_result,
1094 Register last_java_sp,
1095 address entry_point,
1096 Register arg_1,
1097 bool check_exceptions) {
1098 pass_arg1(this, arg_1);
1099 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1100 }
1101
1102 void MacroAssembler::call_VM(Register oop_result,
1103 Register last_java_sp,
1104 address entry_point,
1105 Register arg_1,
1106 Register arg_2,
1107 bool check_exceptions) {
1108
1109 assert_different_registers(arg_1, c_rarg2);
1110 pass_arg2(this, arg_2);
1111 pass_arg1(this, arg_1);
1112 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1113 }
1114
1115 void MacroAssembler::call_VM(Register oop_result,
1116 Register last_java_sp,
1117 address entry_point,
1118 Register arg_1,
1119 Register arg_2,
1120 Register arg_3,
1121 bool check_exceptions) {
1122 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1123 assert_different_registers(arg_2, c_rarg3);
1124 pass_arg3(this, arg_3);
1125 pass_arg2(this, arg_2);
1126 pass_arg1(this, arg_1);
1127 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1128 }
1129
1130 void MacroAssembler::super_call_VM(Register oop_result,
1131 Register last_java_sp,
1132 address entry_point,
1133 int number_of_arguments,
1134 bool check_exceptions) {
1135 MacroAssembler::call_VM_base(oop_result, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1136 }
1137
1138 void MacroAssembler::super_call_VM(Register oop_result,
1139 Register last_java_sp,
1140 address entry_point,
1141 Register arg_1,
1142 bool check_exceptions) {
1143 pass_arg1(this, arg_1);
1144 super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1145 }
1146
1147 void MacroAssembler::super_call_VM(Register oop_result,
1148 Register last_java_sp,
1149 address entry_point,
1150 Register arg_1,
1151 Register arg_2,
1152 bool check_exceptions) {
1153
1154 assert_different_registers(arg_1, c_rarg2);
1155 pass_arg2(this, arg_2);
1156 pass_arg1(this, arg_1);
1157 super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1158 }
1159
1160 void MacroAssembler::super_call_VM(Register oop_result,
1161 Register last_java_sp,
1162 address entry_point,
1163 Register arg_1,
1164 Register arg_2,
1165 Register arg_3,
1166 bool check_exceptions) {
1167 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1168 assert_different_registers(arg_2, c_rarg3);
1169 pass_arg3(this, arg_3);
1170 pass_arg2(this, arg_2);
1171 pass_arg1(this, arg_1);
1172 super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1173 }
1174
1175 void MacroAssembler::call_VM_base(Register oop_result,
1176 Register last_java_sp,
1177 address entry_point,
1178 int number_of_arguments,
1179 bool check_exceptions) {
1180 Register java_thread = r15_thread;
1181
1182 // determine last_java_sp register
1183 if (!last_java_sp->is_valid()) {
1184 last_java_sp = rsp;
1185 }
1186 // debugging support
1187 assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
1188 #ifdef ASSERT
1189 // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1190 // r12 is the heapbase.
1191 if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
1192 #endif // ASSERT
1193
1194 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
1195 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1196
1197 // push java thread (becomes first argument of C function)
1198
1199 mov(c_rarg0, r15_thread);
1200
1201 // set last Java frame before call
1202 assert(last_java_sp != rbp, "can't use ebp/rbp");
1203
1204 // Only interpreter should have to set fp
1205 set_last_Java_frame(last_java_sp, rbp, nullptr, rscratch1);
1206
1207 // do the call, remove parameters
1208 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1209
1210 #ifdef ASSERT
1211 // Check that thread register is not clobbered.
1212 guarantee(java_thread != rax, "change this code");
1213 push(rax);
1214 { Label L;
1215 get_thread_slow(rax);
1216 cmpptr(java_thread, rax);
1217 jcc(Assembler::equal, L);
1218 STOP("MacroAssembler::call_VM_base: java_thread not callee saved?");
1219 bind(L);
1220 }
1221 pop(rax);
1222 #endif
1223
1224 // reset last Java frame
1225 // Only interpreter should have to clear fp
1226 reset_last_Java_frame(true);
1227
1228 // C++ interp handles this in the interpreter
1229 check_and_handle_popframe();
1230 check_and_handle_earlyret();
1231
1232 if (check_exceptions) {
1233 // check for pending exceptions (java_thread is set upon return)
1234 cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1235 // This used to conditionally jump to forward_exception however it is
1236 // possible if we relocate that the branch will not reach. So we must jump
1237 // around so we can always reach
1238
1239 Label ok;
1240 jcc(Assembler::equal, ok);
1241 jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1242 bind(ok);
1243 }
1244
1245 // get oop result if there is one and reset the value in the thread
1246 if (oop_result->is_valid()) {
1247 get_vm_result_oop(oop_result);
1248 }
1249 }
1250
1251 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1252 // Calculate the value for last_Java_sp somewhat subtle.
1253 // call_VM does an intermediate call which places a return address on
1254 // the stack just under the stack pointer as the user finished with it.
1255 // This allows use to retrieve last_Java_pc from last_Java_sp[-1].
1256
1257 // We've pushed one address, correct last_Java_sp
1258 lea(rax, Address(rsp, wordSize));
1259
1260 call_VM_base(oop_result, rax, entry_point, number_of_arguments, check_exceptions);
1261 }
1262
1263 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1264 void MacroAssembler::call_VM_leaf0(address entry_point) {
1265 MacroAssembler::call_VM_leaf_base(entry_point, 0);
1266 }
1267
1268 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1269 call_VM_leaf_base(entry_point, number_of_arguments);
1270 }
1271
1272 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1273 pass_arg0(this, arg_0);
1274 call_VM_leaf(entry_point, 1);
1275 }
1276
1277 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1278
1279 assert_different_registers(arg_0, c_rarg1);
1280 pass_arg1(this, arg_1);
1281 pass_arg0(this, arg_0);
1282 call_VM_leaf(entry_point, 2);
1283 }
1284
1285 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1286 assert_different_registers(arg_0, c_rarg1, c_rarg2);
1287 assert_different_registers(arg_1, c_rarg2);
1288 pass_arg2(this, arg_2);
1289 pass_arg1(this, arg_1);
1290 pass_arg0(this, arg_0);
1291 call_VM_leaf(entry_point, 3);
1292 }
1293
1294 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1295 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
1296 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1297 assert_different_registers(arg_2, c_rarg3);
1298 pass_arg3(this, arg_3);
1299 pass_arg2(this, arg_2);
1300 pass_arg1(this, arg_1);
1301 pass_arg0(this, arg_0);
1302 call_VM_leaf(entry_point, 3);
1303 }
1304
1305 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1306 pass_arg0(this, arg_0);
1307 MacroAssembler::call_VM_leaf_base(entry_point, 1);
1308 }
1309
1310 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1311 assert_different_registers(arg_0, c_rarg1);
1312 pass_arg1(this, arg_1);
1313 pass_arg0(this, arg_0);
1314 MacroAssembler::call_VM_leaf_base(entry_point, 2);
1315 }
1316
1317 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1318 assert_different_registers(arg_0, c_rarg1, c_rarg2);
1319 assert_different_registers(arg_1, c_rarg2);
1320 pass_arg2(this, arg_2);
1321 pass_arg1(this, arg_1);
1322 pass_arg0(this, arg_0);
1323 MacroAssembler::call_VM_leaf_base(entry_point, 3);
1324 }
1325
1326 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1327 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
1328 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1329 assert_different_registers(arg_2, c_rarg3);
1330 pass_arg3(this, arg_3);
1331 pass_arg2(this, arg_2);
1332 pass_arg1(this, arg_1);
1333 pass_arg0(this, arg_0);
1334 MacroAssembler::call_VM_leaf_base(entry_point, 4);
1335 }
1336
1337 void MacroAssembler::get_vm_result_oop(Register oop_result) {
1338 movptr(oop_result, Address(r15_thread, JavaThread::vm_result_oop_offset()));
1339 movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
1340 verify_oop_msg(oop_result, "broken oop in call_VM_base");
1341 }
1342
1343 void MacroAssembler::get_vm_result_metadata(Register metadata_result) {
1344 movptr(metadata_result, Address(r15_thread, JavaThread::vm_result_metadata_offset()));
1345 movptr(Address(r15_thread, JavaThread::vm_result_metadata_offset()), NULL_WORD);
1346 }
1347
1348 void MacroAssembler::check_and_handle_earlyret() {
1349 }
1350
1351 void MacroAssembler::check_and_handle_popframe() {
1352 }
1353
1354 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm, Register rscratch) {
1355 assert(rscratch != noreg || always_reachable(src1), "missing");
1356
1357 if (reachable(src1)) {
1358 cmpl(as_Address(src1), imm);
1359 } else {
1360 lea(rscratch, src1);
1361 cmpl(Address(rscratch, 0), imm);
1362 }
1363 }
1364
1365 void MacroAssembler::cmp32(Register src1, AddressLiteral src2, Register rscratch) {
1366 assert(!src2.is_lval(), "use cmpptr");
1367 assert(rscratch != noreg || always_reachable(src2), "missing");
1368
1369 if (reachable(src2)) {
1370 cmpl(src1, as_Address(src2));
1371 } else {
1372 lea(rscratch, src2);
1373 cmpl(src1, Address(rscratch, 0));
1374 }
1375 }
1376
1377 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1378 Assembler::cmpl(src1, imm);
1379 }
1380
1381 void MacroAssembler::cmp32(Register src1, Address src2) {
1382 Assembler::cmpl(src1, src2);
1383 }
1384
1385 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1386 ucomisd(opr1, opr2);
1387
1388 Label L;
1389 if (unordered_is_less) {
1390 movl(dst, -1);
1391 jcc(Assembler::parity, L);
1392 jcc(Assembler::below , L);
1393 movl(dst, 0);
1394 jcc(Assembler::equal , L);
1395 increment(dst);
1396 } else { // unordered is greater
1397 movl(dst, 1);
1398 jcc(Assembler::parity, L);
1399 jcc(Assembler::above , L);
1400 movl(dst, 0);
1401 jcc(Assembler::equal , L);
1402 decrementl(dst);
1403 }
1404 bind(L);
1405 }
1406
1407 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1408 ucomiss(opr1, opr2);
1409
1410 Label L;
1411 if (unordered_is_less) {
1412 movl(dst, -1);
1413 jcc(Assembler::parity, L);
1414 jcc(Assembler::below , L);
1415 movl(dst, 0);
1416 jcc(Assembler::equal , L);
1417 increment(dst);
1418 } else { // unordered is greater
1419 movl(dst, 1);
1420 jcc(Assembler::parity, L);
1421 jcc(Assembler::above , L);
1422 movl(dst, 0);
1423 jcc(Assembler::equal , L);
1424 decrementl(dst);
1425 }
1426 bind(L);
1427 }
1428
1429
1430 void MacroAssembler::cmp8(AddressLiteral src1, int imm, Register rscratch) {
1431 assert(rscratch != noreg || always_reachable(src1), "missing");
1432
1433 if (reachable(src1)) {
1434 cmpb(as_Address(src1), imm);
1435 } else {
1436 lea(rscratch, src1);
1437 cmpb(Address(rscratch, 0), imm);
1438 }
1439 }
1440
1441 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2, Register rscratch) {
1442 assert(rscratch != noreg || always_reachable(src2), "missing");
1443
1444 if (src2.is_lval()) {
1445 movptr(rscratch, src2);
1446 Assembler::cmpq(src1, rscratch);
1447 } else if (reachable(src2)) {
1448 cmpq(src1, as_Address(src2));
1449 } else {
1450 lea(rscratch, src2);
1451 Assembler::cmpq(src1, Address(rscratch, 0));
1452 }
1453 }
1454
1455 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2, Register rscratch) {
1456 assert(src2.is_lval(), "not a mem-mem compare");
1457 // moves src2's literal address
1458 movptr(rscratch, src2);
1459 Assembler::cmpq(src1, rscratch);
1460 }
1461
1462 void MacroAssembler::cmpoop(Register src1, Register src2) {
1463 cmpptr(src1, src2);
1464 }
1465
1466 void MacroAssembler::cmpoop(Register src1, Address src2) {
1467 cmpptr(src1, src2);
1468 }
1469
1470 void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) {
1471 movoop(rscratch, src2);
1472 cmpptr(src1, rscratch);
1473 }
1474
1475 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) {
1476 assert(rscratch != noreg || always_reachable(adr), "missing");
1477
1478 if (reachable(adr)) {
1479 lock();
1480 cmpxchgptr(reg, as_Address(adr));
1481 } else {
1482 lea(rscratch, adr);
1483 lock();
1484 cmpxchgptr(reg, Address(rscratch, 0));
1485 }
1486 }
1487
1488 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
1489 cmpxchgq(reg, adr);
1490 }
1491
1492 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1493 assert(rscratch != noreg || always_reachable(src), "missing");
1494
1495 if (reachable(src)) {
1496 Assembler::comisd(dst, as_Address(src));
1497 } else {
1498 lea(rscratch, src);
1499 Assembler::comisd(dst, Address(rscratch, 0));
1500 }
1501 }
1502
1503 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1504 assert(rscratch != noreg || always_reachable(src), "missing");
1505
1506 if (reachable(src)) {
1507 Assembler::comiss(dst, as_Address(src));
1508 } else {
1509 lea(rscratch, src);
1510 Assembler::comiss(dst, Address(rscratch, 0));
1511 }
1512 }
1513
1514
1515 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch) {
1516 assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1517
1518 Condition negated_cond = negate_condition(cond);
1519 Label L;
1520 jcc(negated_cond, L);
1521 pushf(); // Preserve flags
1522 atomic_incl(counter_addr, rscratch);
1523 popf();
1524 bind(L);
1525 }
1526
1527 int MacroAssembler::corrected_idivl(Register reg) {
1528 // Full implementation of Java idiv and irem; checks for
1529 // special case as described in JVM spec., p.243 & p.271.
1530 // The function returns the (pc) offset of the idivl
1531 // instruction - may be needed for implicit exceptions.
1532 //
1533 // normal case special case
1534 //
1535 // input : rax,: dividend min_int
1536 // reg: divisor (may not be rax,/rdx) -1
1537 //
1538 // output: rax,: quotient (= rax, idiv reg) min_int
1539 // rdx: remainder (= rax, irem reg) 0
1540 assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
1541 const int min_int = 0x80000000;
1542 Label normal_case, special_case;
1543
1544 // check for special case
1545 cmpl(rax, min_int);
1546 jcc(Assembler::notEqual, normal_case);
1547 xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
1548 cmpl(reg, -1);
1549 jcc(Assembler::equal, special_case);
1550
1551 // handle normal case
1552 bind(normal_case);
1553 cdql();
1554 int idivl_offset = offset();
1555 idivl(reg);
1556
1557 // normal and special case exit
1558 bind(special_case);
1559
1560 return idivl_offset;
1561 }
1562
1563
1564
1565 void MacroAssembler::decrementl(Register reg, int value) {
1566 if (value == min_jint) {subl(reg, value) ; return; }
1567 if (value < 0) { incrementl(reg, -value); return; }
1568 if (value == 0) { ; return; }
1569 if (value == 1 && UseIncDec) { decl(reg) ; return; }
1570 /* else */ { subl(reg, value) ; return; }
1571 }
1572
1573 void MacroAssembler::decrementl(Address dst, int value) {
1574 if (value == min_jint) {subl(dst, value) ; return; }
1575 if (value < 0) { incrementl(dst, -value); return; }
1576 if (value == 0) { ; return; }
1577 if (value == 1 && UseIncDec) { decl(dst) ; return; }
1578 /* else */ { subl(dst, value) ; return; }
1579 }
1580
1581 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
1582 assert(shift_value > 0, "illegal shift value");
1583 Label _is_positive;
1584 testl (reg, reg);
1585 jcc (Assembler::positive, _is_positive);
1586 int offset = (1 << shift_value) - 1 ;
1587
1588 if (offset == 1) {
1589 incrementl(reg);
1590 } else {
1591 addl(reg, offset);
1592 }
1593
1594 bind (_is_positive);
1595 sarl(reg, shift_value);
1596 }
1597
1598 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1599 assert(rscratch != noreg || always_reachable(src), "missing");
1600
1601 if (reachable(src)) {
1602 Assembler::divsd(dst, as_Address(src));
1603 } else {
1604 lea(rscratch, src);
1605 Assembler::divsd(dst, Address(rscratch, 0));
1606 }
1607 }
1608
1609 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1610 assert(rscratch != noreg || always_reachable(src), "missing");
1611
1612 if (reachable(src)) {
1613 Assembler::divss(dst, as_Address(src));
1614 } else {
1615 lea(rscratch, src);
1616 Assembler::divss(dst, Address(rscratch, 0));
1617 }
1618 }
1619
1620 void MacroAssembler::enter() {
1621 push(rbp);
1622 mov(rbp, rsp);
1623 }
1624
1625 void MacroAssembler::post_call_nop() {
1626 if (!Continuations::enabled()) {
1627 return;
1628 }
1629 InstructionMark im(this);
1630 relocate(post_call_nop_Relocation::spec());
1631 InlineSkippedInstructionsCounter skipCounter(this);
1632 emit_int8((uint8_t)0x0f);
1633 emit_int8((uint8_t)0x1f);
1634 emit_int8((uint8_t)0x84);
1635 emit_int8((uint8_t)0x00);
1636 emit_int32(0x00);
1637 }
1638
1639 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1640 assert(rscratch != noreg || always_reachable(src), "missing");
1641 if (reachable(src)) {
1642 Assembler::mulpd(dst, as_Address(src));
1643 } else {
1644 lea(rscratch, src);
1645 Assembler::mulpd(dst, Address(rscratch, 0));
1646 }
1647 }
1648
1649 // dst = c = a * b + c
1650 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
1651 Assembler::vfmadd231sd(c, a, b);
1652 if (dst != c) {
1653 movdbl(dst, c);
1654 }
1655 }
1656
1657 // dst = c = a * b + c
1658 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
1659 Assembler::vfmadd231ss(c, a, b);
1660 if (dst != c) {
1661 movflt(dst, c);
1662 }
1663 }
1664
1665 // dst = c = a * b + c
1666 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
1667 Assembler::vfmadd231pd(c, a, b, vector_len);
1668 if (dst != c) {
1669 vmovdqu(dst, c);
1670 }
1671 }
1672
1673 // dst = c = a * b + c
1674 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
1675 Assembler::vfmadd231ps(c, a, b, vector_len);
1676 if (dst != c) {
1677 vmovdqu(dst, c);
1678 }
1679 }
1680
1681 // dst = c = a * b + c
1682 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
1683 Assembler::vfmadd231pd(c, a, b, vector_len);
1684 if (dst != c) {
1685 vmovdqu(dst, c);
1686 }
1687 }
1688
1689 // dst = c = a * b + c
1690 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
1691 Assembler::vfmadd231ps(c, a, b, vector_len);
1692 if (dst != c) {
1693 vmovdqu(dst, c);
1694 }
1695 }
1696
1697 void MacroAssembler::incrementl(AddressLiteral dst, Register rscratch) {
1698 assert(rscratch != noreg || always_reachable(dst), "missing");
1699
1700 if (reachable(dst)) {
1701 incrementl(as_Address(dst));
1702 } else {
1703 lea(rscratch, dst);
1704 incrementl(Address(rscratch, 0));
1705 }
1706 }
1707
1708 void MacroAssembler::incrementl(ArrayAddress dst, Register rscratch) {
1709 incrementl(as_Address(dst, rscratch));
1710 }
1711
1712 void MacroAssembler::incrementl(Register reg, int value) {
1713 if (value == min_jint) {addl(reg, value) ; return; }
1714 if (value < 0) { decrementl(reg, -value); return; }
1715 if (value == 0) { ; return; }
1716 if (value == 1 && UseIncDec) { incl(reg) ; return; }
1717 /* else */ { addl(reg, value) ; return; }
1718 }
1719
1720 void MacroAssembler::incrementl(Address dst, int value) {
1721 if (value == min_jint) {addl(dst, value) ; return; }
1722 if (value < 0) { decrementl(dst, -value); return; }
1723 if (value == 0) { ; return; }
1724 if (value == 1 && UseIncDec) { incl(dst) ; return; }
1725 /* else */ { addl(dst, value) ; return; }
1726 }
1727
1728 void MacroAssembler::jump(AddressLiteral dst, Register rscratch) {
1729 assert(rscratch != noreg || always_reachable(dst), "missing");
1730 assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump");
1731 if (reachable(dst)) {
1732 jmp_literal(dst.target(), dst.rspec());
1733 } else {
1734 lea(rscratch, dst);
1735 jmp(rscratch);
1736 }
1737 }
1738
1739 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst, Register rscratch) {
1740 assert(rscratch != noreg || always_reachable(dst), "missing");
1741 assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump_cc");
1742 if (reachable(dst)) {
1743 InstructionMark im(this);
1744 relocate(dst.reloc());
1745 const int short_size = 2;
1746 const int long_size = 6;
1747 int offs = (intptr_t)dst.target() - ((intptr_t)pc());
1748 if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
1749 // 0111 tttn #8-bit disp
1750 emit_int8(0x70 | cc);
1751 emit_int8((offs - short_size) & 0xFF);
1752 } else {
1753 // 0000 1111 1000 tttn #32-bit disp
1754 emit_int8(0x0F);
1755 emit_int8((unsigned char)(0x80 | cc));
1756 emit_int32(offs - long_size);
1757 }
1758 } else {
1759 #ifdef ASSERT
1760 warning("reversing conditional branch");
1761 #endif /* ASSERT */
1762 Label skip;
1763 jccb(reverse[cc], skip);
1764 lea(rscratch, dst);
1765 Assembler::jmp(rscratch);
1766 bind(skip);
1767 }
1768 }
1769
1770 void MacroAssembler::cmp32_mxcsr_std(Address mxcsr_save, Register tmp, Register rscratch) {
1771 ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
1772 assert(rscratch != noreg || always_reachable(mxcsr_std), "missing");
1773
1774 stmxcsr(mxcsr_save);
1775 movl(tmp, mxcsr_save);
1776 if (EnableX86ECoreOpts) {
1777 // The mxcsr_std has status bits set for performance on ECore
1778 orl(tmp, 0x003f);
1779 } else {
1780 // Mask out status bits (only check control and mask bits)
1781 andl(tmp, 0xFFC0);
1782 }
1783 cmp32(tmp, mxcsr_std, rscratch);
1784 }
1785
1786 void MacroAssembler::ldmxcsr(AddressLiteral src, Register rscratch) {
1787 assert(rscratch != noreg || always_reachable(src), "missing");
1788
1789 if (reachable(src)) {
1790 Assembler::ldmxcsr(as_Address(src));
1791 } else {
1792 lea(rscratch, src);
1793 Assembler::ldmxcsr(Address(rscratch, 0));
1794 }
1795 }
1796
1797 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1798 int off = offset();
1799 movsbl(dst, src); // movsxb
1800 return off;
1801 }
1802
1803 // Note: load_signed_short used to be called load_signed_word.
1804 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
1805 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
1806 // The term "word" in HotSpot means a 32- or 64-bit machine word.
1807 int MacroAssembler::load_signed_short(Register dst, Address src) {
1808 // This is dubious to me since it seems safe to do a signed 16 => 64 bit
1809 // version but this is what 64bit has always done. This seems to imply
1810 // that users are only using 32bits worth.
1811 int off = offset();
1812 movswl(dst, src); // movsxw
1813 return off;
1814 }
1815
1816 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1817 // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
1818 // and "3.9 Partial Register Penalties", p. 22).
1819 int off = offset();
1820 movzbl(dst, src); // movzxb
1821 return off;
1822 }
1823
1824 // Note: load_unsigned_short used to be called load_unsigned_word.
1825 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1826 // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
1827 // and "3.9 Partial Register Penalties", p. 22).
1828 int off = offset();
1829 movzwl(dst, src); // movzxw
1830 return off;
1831 }
1832
1833 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1834 switch (size_in_bytes) {
1835 case 8: movq(dst, src); break;
1836 case 4: movl(dst, src); break;
1837 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1838 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1839 default: ShouldNotReachHere();
1840 }
1841 }
1842
1843 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1844 switch (size_in_bytes) {
1845 case 8: movq(dst, src); break;
1846 case 4: movl(dst, src); break;
1847 case 2: movw(dst, src); break;
1848 case 1: movb(dst, src); break;
1849 default: ShouldNotReachHere();
1850 }
1851 }
1852
1853 void MacroAssembler::mov32(AddressLiteral dst, Register src, Register rscratch) {
1854 assert(rscratch != noreg || always_reachable(dst), "missing");
1855
1856 if (reachable(dst)) {
1857 movl(as_Address(dst), src);
1858 } else {
1859 lea(rscratch, dst);
1860 movl(Address(rscratch, 0), src);
1861 }
1862 }
1863
1864 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
1865 if (reachable(src)) {
1866 movl(dst, as_Address(src));
1867 } else {
1868 lea(dst, src);
1869 movl(dst, Address(dst, 0));
1870 }
1871 }
1872
1873 // C++ bool manipulation
1874
1875 void MacroAssembler::movbool(Register dst, Address src) {
1876 if(sizeof(bool) == 1)
1877 movb(dst, src);
1878 else if(sizeof(bool) == 2)
1879 movw(dst, src);
1880 else if(sizeof(bool) == 4)
1881 movl(dst, src);
1882 else
1883 // unsupported
1884 ShouldNotReachHere();
1885 }
1886
1887 void MacroAssembler::movbool(Address dst, bool boolconst) {
1888 if(sizeof(bool) == 1)
1889 movb(dst, (int) boolconst);
1890 else if(sizeof(bool) == 2)
1891 movw(dst, (int) boolconst);
1892 else if(sizeof(bool) == 4)
1893 movl(dst, (int) boolconst);
1894 else
1895 // unsupported
1896 ShouldNotReachHere();
1897 }
1898
1899 void MacroAssembler::movbool(Address dst, Register src) {
1900 if(sizeof(bool) == 1)
1901 movb(dst, src);
1902 else if(sizeof(bool) == 2)
1903 movw(dst, src);
1904 else if(sizeof(bool) == 4)
1905 movl(dst, src);
1906 else
1907 // unsupported
1908 ShouldNotReachHere();
1909 }
1910
1911 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src, Register rscratch) {
1912 assert(rscratch != noreg || always_reachable(src), "missing");
1913
1914 if (reachable(src)) {
1915 movdl(dst, as_Address(src));
1916 } else {
1917 lea(rscratch, src);
1918 movdl(dst, Address(rscratch, 0));
1919 }
1920 }
1921
1922 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src, Register rscratch) {
1923 assert(rscratch != noreg || always_reachable(src), "missing");
1924
1925 if (reachable(src)) {
1926 movq(dst, as_Address(src));
1927 } else {
1928 lea(rscratch, src);
1929 movq(dst, Address(rscratch, 0));
1930 }
1931 }
1932
1933 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src, Register rscratch) {
1934 assert(rscratch != noreg || always_reachable(src), "missing");
1935
1936 if (reachable(src)) {
1937 if (UseXmmLoadAndClearUpper) {
1938 movsd (dst, as_Address(src));
1939 } else {
1940 movlpd(dst, as_Address(src));
1941 }
1942 } else {
1943 lea(rscratch, src);
1944 if (UseXmmLoadAndClearUpper) {
1945 movsd (dst, Address(rscratch, 0));
1946 } else {
1947 movlpd(dst, Address(rscratch, 0));
1948 }
1949 }
1950 }
1951
1952 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src, Register rscratch) {
1953 assert(rscratch != noreg || always_reachable(src), "missing");
1954
1955 if (reachable(src)) {
1956 movss(dst, as_Address(src));
1957 } else {
1958 lea(rscratch, src);
1959 movss(dst, Address(rscratch, 0));
1960 }
1961 }
1962
1963 void MacroAssembler::movhlf(XMMRegister dst, XMMRegister src, Register rscratch) {
1964 if (VM_Version::supports_avx10_2()) {
1965 evmovw(dst, src);
1966 } else {
1967 assert(rscratch != noreg, "missing");
1968 evmovw(rscratch, src);
1969 evmovw(dst, rscratch);
1970 }
1971 }
1972
1973 void MacroAssembler::mov64(Register dst, int64_t imm64) {
1974 if (is_uimm32(imm64)) {
1975 movl(dst, checked_cast<uint32_t>(imm64));
1976 } else if (is_simm32(imm64)) {
1977 movq(dst, checked_cast<int32_t>(imm64));
1978 } else {
1979 Assembler::mov64(dst, imm64);
1980 }
1981 }
1982
1983 void MacroAssembler::mov64(Register dst, int64_t imm64, relocInfo::relocType rtype, int format) {
1984 Assembler::mov64(dst, imm64, rtype, format);
1985 }
1986
1987 void MacroAssembler::movptr(Register dst, Register src) {
1988 movq(dst, src);
1989 }
1990
1991 void MacroAssembler::movptr(Register dst, Address src) {
1992 movq(dst, src);
1993 }
1994
1995 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
1996 void MacroAssembler::movptr(Register dst, intptr_t src) {
1997 mov64(dst, src);
1998 }
1999
2000 void MacroAssembler::movptr(Address dst, Register src) {
2001 movq(dst, src);
2002 }
2003
2004 void MacroAssembler::movptr(Address dst, int32_t src) {
2005 movslq(dst, src);
2006 }
2007
2008 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2009 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2010 Assembler::movdqu(dst, src);
2011 }
2012
2013 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2014 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2015 Assembler::movdqu(dst, src);
2016 }
2017
2018 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2019 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2020 Assembler::movdqu(dst, src);
2021 }
2022
2023 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2024 assert(rscratch != noreg || always_reachable(src), "missing");
2025
2026 if (reachable(src)) {
2027 movdqu(dst, as_Address(src));
2028 } else {
2029 lea(rscratch, src);
2030 movdqu(dst, Address(rscratch, 0));
2031 }
2032 }
2033
2034 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2035 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2036 Assembler::vmovdqu(dst, src);
2037 }
2038
2039 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2040 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2041 Assembler::vmovdqu(dst, src);
2042 }
2043
2044 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2045 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2046 Assembler::vmovdqu(dst, src);
2047 }
2048
2049 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2050 assert(rscratch != noreg || always_reachable(src), "missing");
2051
2052 if (reachable(src)) {
2053 vmovdqu(dst, as_Address(src));
2054 }
2055 else {
2056 lea(rscratch, src);
2057 vmovdqu(dst, Address(rscratch, 0));
2058 }
2059 }
2060
2061 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2062 assert(rscratch != noreg || always_reachable(src), "missing");
2063
2064 if (vector_len == AVX_512bit) {
2065 evmovdquq(dst, src, AVX_512bit, rscratch);
2066 } else if (vector_len == AVX_256bit) {
2067 vmovdqu(dst, src, rscratch);
2068 } else {
2069 movdqu(dst, src, rscratch);
2070 }
2071 }
2072
2073 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src, int vector_len) {
2074 if (vector_len == AVX_512bit) {
2075 evmovdquq(dst, src, AVX_512bit);
2076 } else if (vector_len == AVX_256bit) {
2077 vmovdqu(dst, src);
2078 } else {
2079 movdqu(dst, src);
2080 }
2081 }
2082
2083 void MacroAssembler::vmovdqu(Address dst, XMMRegister src, int vector_len) {
2084 if (vector_len == AVX_512bit) {
2085 evmovdquq(dst, src, AVX_512bit);
2086 } else if (vector_len == AVX_256bit) {
2087 vmovdqu(dst, src);
2088 } else {
2089 movdqu(dst, src);
2090 }
2091 }
2092
2093 void MacroAssembler::vmovdqu(XMMRegister dst, Address src, int vector_len) {
2094 if (vector_len == AVX_512bit) {
2095 evmovdquq(dst, src, AVX_512bit);
2096 } else if (vector_len == AVX_256bit) {
2097 vmovdqu(dst, src);
2098 } else {
2099 movdqu(dst, src);
2100 }
2101 }
2102
2103 void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
2104 assert(rscratch != noreg || always_reachable(src), "missing");
2105
2106 if (reachable(src)) {
2107 vmovdqa(dst, as_Address(src));
2108 }
2109 else {
2110 lea(rscratch, src);
2111 vmovdqa(dst, Address(rscratch, 0));
2112 }
2113 }
2114
2115 void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2116 assert(rscratch != noreg || always_reachable(src), "missing");
2117
2118 if (vector_len == AVX_512bit) {
2119 evmovdqaq(dst, src, AVX_512bit, rscratch);
2120 } else if (vector_len == AVX_256bit) {
2121 vmovdqa(dst, src, rscratch);
2122 } else {
2123 movdqa(dst, src, rscratch);
2124 }
2125 }
2126
2127 void MacroAssembler::kmov(KRegister dst, Address src) {
2128 if (VM_Version::supports_avx512bw()) {
2129 kmovql(dst, src);
2130 } else {
2131 assert(VM_Version::supports_evex(), "");
2132 kmovwl(dst, src);
2133 }
2134 }
2135
2136 void MacroAssembler::kmov(Address dst, KRegister src) {
2137 if (VM_Version::supports_avx512bw()) {
2138 kmovql(dst, src);
2139 } else {
2140 assert(VM_Version::supports_evex(), "");
2141 kmovwl(dst, src);
2142 }
2143 }
2144
2145 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2146 if (VM_Version::supports_avx512bw()) {
2147 kmovql(dst, src);
2148 } else {
2149 assert(VM_Version::supports_evex(), "");
2150 kmovwl(dst, src);
2151 }
2152 }
2153
2154 void MacroAssembler::kmov(Register dst, KRegister src) {
2155 if (VM_Version::supports_avx512bw()) {
2156 kmovql(dst, src);
2157 } else {
2158 assert(VM_Version::supports_evex(), "");
2159 kmovwl(dst, src);
2160 }
2161 }
2162
2163 void MacroAssembler::kmov(KRegister dst, Register src) {
2164 if (VM_Version::supports_avx512bw()) {
2165 kmovql(dst, src);
2166 } else {
2167 assert(VM_Version::supports_evex(), "");
2168 kmovwl(dst, src);
2169 }
2170 }
2171
2172 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register rscratch) {
2173 assert(rscratch != noreg || always_reachable(src), "missing");
2174
2175 if (reachable(src)) {
2176 kmovql(dst, as_Address(src));
2177 } else {
2178 lea(rscratch, src);
2179 kmovql(dst, Address(rscratch, 0));
2180 }
2181 }
2182
2183 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register rscratch) {
2184 assert(rscratch != noreg || always_reachable(src), "missing");
2185
2186 if (reachable(src)) {
2187 kmovwl(dst, as_Address(src));
2188 } else {
2189 lea(rscratch, src);
2190 kmovwl(dst, Address(rscratch, 0));
2191 }
2192 }
2193
2194 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2195 int vector_len, Register rscratch) {
2196 assert(rscratch != noreg || always_reachable(src), "missing");
2197
2198 if (reachable(src)) {
2199 Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2200 } else {
2201 lea(rscratch, src);
2202 Assembler::evmovdqub(dst, mask, Address(rscratch, 0), merge, vector_len);
2203 }
2204 }
2205
2206 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2207 int vector_len, Register rscratch) {
2208 assert(rscratch != noreg || always_reachable(src), "missing");
2209
2210 if (reachable(src)) {
2211 Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2212 } else {
2213 lea(rscratch, src);
2214 Assembler::evmovdquw(dst, mask, Address(rscratch, 0), merge, vector_len);
2215 }
2216 }
2217
2218 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2219 assert(rscratch != noreg || always_reachable(src), "missing");
2220
2221 if (reachable(src)) {
2222 Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2223 } else {
2224 lea(rscratch, src);
2225 Assembler::evmovdqul(dst, mask, Address(rscratch, 0), merge, vector_len);
2226 }
2227 }
2228
2229 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2230 assert(rscratch != noreg || always_reachable(src), "missing");
2231
2232 if (reachable(src)) {
2233 Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2234 } else {
2235 lea(rscratch, src);
2236 Assembler::evmovdquq(dst, mask, Address(rscratch, 0), merge, vector_len);
2237 }
2238 }
2239
2240 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2241 assert(rscratch != noreg || always_reachable(src), "missing");
2242
2243 if (reachable(src)) {
2244 Assembler::evmovdquq(dst, as_Address(src), vector_len);
2245 } else {
2246 lea(rscratch, src);
2247 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2248 }
2249 }
2250
2251 void MacroAssembler::evmovdqaq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2252 assert(rscratch != noreg || always_reachable(src), "missing");
2253
2254 if (reachable(src)) {
2255 Assembler::evmovdqaq(dst, mask, as_Address(src), merge, vector_len);
2256 } else {
2257 lea(rscratch, src);
2258 Assembler::evmovdqaq(dst, mask, Address(rscratch, 0), merge, vector_len);
2259 }
2260 }
2261
2262 void MacroAssembler::evmovdqaq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2263 assert(rscratch != noreg || always_reachable(src), "missing");
2264
2265 if (reachable(src)) {
2266 Assembler::evmovdqaq(dst, as_Address(src), vector_len);
2267 } else {
2268 lea(rscratch, src);
2269 Assembler::evmovdqaq(dst, Address(rscratch, 0), vector_len);
2270 }
2271 }
2272
2273 void MacroAssembler::movapd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2274 assert(rscratch != noreg || always_reachable(src), "missing");
2275
2276 if (reachable(src)) {
2277 Assembler::movapd(dst, as_Address(src));
2278 } else {
2279 lea(rscratch, src);
2280 Assembler::movapd(dst, Address(rscratch, 0));
2281 }
2282 }
2283
2284 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
2285 assert(rscratch != noreg || always_reachable(src), "missing");
2286
2287 if (reachable(src)) {
2288 Assembler::movdqa(dst, as_Address(src));
2289 } else {
2290 lea(rscratch, src);
2291 Assembler::movdqa(dst, Address(rscratch, 0));
2292 }
2293 }
2294
2295 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2296 assert(rscratch != noreg || always_reachable(src), "missing");
2297
2298 if (reachable(src)) {
2299 Assembler::movsd(dst, as_Address(src));
2300 } else {
2301 lea(rscratch, src);
2302 Assembler::movsd(dst, Address(rscratch, 0));
2303 }
2304 }
2305
2306 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2307 assert(rscratch != noreg || always_reachable(src), "missing");
2308
2309 if (reachable(src)) {
2310 Assembler::movss(dst, as_Address(src));
2311 } else {
2312 lea(rscratch, src);
2313 Assembler::movss(dst, Address(rscratch, 0));
2314 }
2315 }
2316
2317 void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) {
2318 assert(rscratch != noreg || always_reachable(src), "missing");
2319
2320 if (reachable(src)) {
2321 Assembler::movddup(dst, as_Address(src));
2322 } else {
2323 lea(rscratch, src);
2324 Assembler::movddup(dst, Address(rscratch, 0));
2325 }
2326 }
2327
2328 void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2329 assert(rscratch != noreg || always_reachable(src), "missing");
2330
2331 if (reachable(src)) {
2332 Assembler::vmovddup(dst, as_Address(src), vector_len);
2333 } else {
2334 lea(rscratch, src);
2335 Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
2336 }
2337 }
2338
2339 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2340 assert(rscratch != noreg || always_reachable(src), "missing");
2341
2342 if (reachable(src)) {
2343 Assembler::mulsd(dst, as_Address(src));
2344 } else {
2345 lea(rscratch, src);
2346 Assembler::mulsd(dst, Address(rscratch, 0));
2347 }
2348 }
2349
2350 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2351 assert(rscratch != noreg || always_reachable(src), "missing");
2352
2353 if (reachable(src)) {
2354 Assembler::mulss(dst, as_Address(src));
2355 } else {
2356 lea(rscratch, src);
2357 Assembler::mulss(dst, Address(rscratch, 0));
2358 }
2359 }
2360
2361 void MacroAssembler::null_check(Register reg, int offset) {
2362 if (needs_explicit_null_check(offset)) {
2363 // provoke OS null exception if reg is null by
2364 // accessing M[reg] w/o changing any (non-CC) registers
2365 // NOTE: cmpl is plenty here to provoke a segv
2366 cmpptr(rax, Address(reg, 0));
2367 // Note: should probably use testl(rax, Address(reg, 0));
2368 // may be shorter code (however, this version of
2369 // testl needs to be implemented first)
2370 } else {
2371 // nothing to do, (later) access of M[reg + offset]
2372 // will provoke OS null exception if reg is null
2373 }
2374 }
2375
2376 void MacroAssembler::os_breakpoint() {
2377 // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
2378 // (e.g., MSVC can't call ps() otherwise)
2379 call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2380 }
2381
2382 void MacroAssembler::unimplemented(const char* what) {
2383 const char* buf = nullptr;
2384 {
2385 ResourceMark rm;
2386 stringStream ss;
2387 ss.print("unimplemented: %s", what);
2388 buf = code_string(ss.as_string());
2389 }
2390 stop(buf);
2391 }
2392
2393 #define XSTATE_BV 0x200
2394
2395 void MacroAssembler::pop_CPU_state() {
2396 pop_FPU_state();
2397 pop_IU_state();
2398 }
2399
2400 void MacroAssembler::pop_FPU_state() {
2401 fxrstor(Address(rsp, 0));
2402 addptr(rsp, FPUStateSizeInWords * wordSize);
2403 }
2404
2405 void MacroAssembler::pop_IU_state() {
2406 popa();
2407 addq(rsp, 8);
2408 popf();
2409 }
2410
2411 // Save Integer and Float state
2412 // Warning: Stack must be 16 byte aligned (64bit)
2413 void MacroAssembler::push_CPU_state() {
2414 push_IU_state();
2415 push_FPU_state();
2416 }
2417
2418 void MacroAssembler::push_FPU_state() {
2419 subptr(rsp, FPUStateSizeInWords * wordSize);
2420 fxsave(Address(rsp, 0));
2421 }
2422
2423 void MacroAssembler::push_IU_state() {
2424 // Push flags first because pusha kills them
2425 pushf();
2426 // Make sure rsp stays 16-byte aligned
2427 subq(rsp, 8);
2428 pusha();
2429 }
2430
2431 void MacroAssembler::push_cont_fastpath() {
2432 if (!Continuations::enabled()) return;
2433
2434 Label L_done;
2435 cmpptr(rsp, Address(r15_thread, JavaThread::cont_fastpath_offset()));
2436 jccb(Assembler::belowEqual, L_done);
2437 movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rsp);
2438 bind(L_done);
2439 }
2440
2441 void MacroAssembler::pop_cont_fastpath() {
2442 if (!Continuations::enabled()) return;
2443
2444 Label L_done;
2445 cmpptr(rsp, Address(r15_thread, JavaThread::cont_fastpath_offset()));
2446 jccb(Assembler::below, L_done);
2447 movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
2448 bind(L_done);
2449 }
2450
2451 #ifdef ASSERT
2452 void MacroAssembler::stop_if_in_cont(Register cont, const char* name) {
2453 Label no_cont;
2454 movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset()));
2455 testl(cont, cont);
2456 jcc(Assembler::zero, no_cont);
2457 stop(name);
2458 bind(no_cont);
2459 }
2460 #endif
2461
2462 void MacroAssembler::reset_last_Java_frame(bool clear_fp) { // determine java_thread register
2463 // we must set sp to zero to clear frame
2464 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
2465 // must clear fp, so that compiled frames are not confused; it is
2466 // possible that we need it only for debugging
2467 if (clear_fp) {
2468 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2469 }
2470 // Always clear the pc because it could have been set by make_walkable()
2471 movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
2472 vzeroupper();
2473 }
2474
2475 void MacroAssembler::round_to(Register reg, int modulus) {
2476 addptr(reg, modulus - 1);
2477 andptr(reg, -modulus);
2478 }
2479
2480 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod) {
2481 if (at_return) {
2482 // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
2483 // we may safely use rsp instead to perform the stack watermark check.
2484 cmpptr(in_nmethod ? rsp : rbp, Address(r15_thread, JavaThread::polling_word_offset()));
2485 jcc(Assembler::above, slow_path);
2486 return;
2487 }
2488 testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2489 jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
2490 }
2491
2492 // Calls to C land
2493 //
2494 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded
2495 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
2496 // has to be reset to 0. This is required to allow proper stack traversal.
2497 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
2498 Register last_java_fp,
2499 address last_java_pc,
2500 Register rscratch) {
2501 vzeroupper();
2502 // determine last_java_sp register
2503 if (!last_java_sp->is_valid()) {
2504 last_java_sp = rsp;
2505 }
2506 // last_java_fp is optional
2507 if (last_java_fp->is_valid()) {
2508 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
2509 }
2510 // last_java_pc is optional
2511 if (last_java_pc != nullptr) {
2512 Address java_pc(r15_thread,
2513 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
2514 lea(java_pc, InternalAddress(last_java_pc), rscratch);
2515 }
2516 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
2517 }
2518
2519 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
2520 Register last_java_fp,
2521 Label &L,
2522 Register scratch) {
2523 lea(scratch, L);
2524 movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), scratch);
2525 set_last_Java_frame(last_java_sp, last_java_fp, nullptr, scratch);
2526 }
2527
2528 void MacroAssembler::shlptr(Register dst, int imm8) {
2529 shlq(dst, imm8);
2530 }
2531
2532 void MacroAssembler::shrptr(Register dst, int imm8) {
2533 shrq(dst, imm8);
2534 }
2535
2536 void MacroAssembler::sign_extend_byte(Register reg) {
2537 movsbl(reg, reg); // movsxb
2538 }
2539
2540 void MacroAssembler::sign_extend_short(Register reg) {
2541 movswl(reg, reg); // movsxw
2542 }
2543
2544 void MacroAssembler::narrow_subword_type(Register reg, BasicType bt) {
2545 assert(is_subword_type(bt), "required");
2546 switch (bt) {
2547 case T_BOOLEAN: andl(reg, 1); break;
2548 case T_BYTE: movsbl(reg, reg); break;
2549 case T_CHAR: movzwl(reg, reg); break;
2550 case T_SHORT: movswl(reg, reg); break;
2551 default: ShouldNotReachHere();
2552 }
2553 }
2554
2555 void MacroAssembler::testl(Address dst, int32_t imm32) {
2556 if (imm32 >= 0 && is8bit(imm32)) {
2557 testb(dst, imm32);
2558 } else {
2559 Assembler::testl(dst, imm32);
2560 }
2561 }
2562
2563 void MacroAssembler::testl(Register dst, int32_t imm32) {
2564 if (imm32 >= 0 && is8bit(imm32) && dst->has_byte_register()) {
2565 testb(dst, imm32);
2566 } else {
2567 Assembler::testl(dst, imm32);
2568 }
2569 }
2570
2571 void MacroAssembler::testl(Register dst, AddressLiteral src) {
2572 assert(always_reachable(src), "Address should be reachable");
2573 testl(dst, as_Address(src));
2574 }
2575
2576 void MacroAssembler::testq(Address dst, int32_t imm32) {
2577 if (imm32 >= 0) {
2578 testl(dst, imm32);
2579 } else {
2580 Assembler::testq(dst, imm32);
2581 }
2582 }
2583
2584 void MacroAssembler::testq(Register dst, int32_t imm32) {
2585 if (imm32 >= 0) {
2586 testl(dst, imm32);
2587 } else {
2588 Assembler::testq(dst, imm32);
2589 }
2590 }
2591
2592 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
2593 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2594 Assembler::pcmpeqb(dst, src);
2595 }
2596
2597 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
2598 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2599 Assembler::pcmpeqw(dst, src);
2600 }
2601
2602 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
2603 assert((dst->encoding() < 16),"XMM register should be 0-15");
2604 Assembler::pcmpestri(dst, src, imm8);
2605 }
2606
2607 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
2608 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2609 Assembler::pcmpestri(dst, src, imm8);
2610 }
2611
2612 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
2613 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2614 Assembler::pmovzxbw(dst, src);
2615 }
2616
2617 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
2618 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2619 Assembler::pmovzxbw(dst, src);
2620 }
2621
2622 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
2623 assert((src->encoding() < 16),"XMM register should be 0-15");
2624 Assembler::pmovmskb(dst, src);
2625 }
2626
2627 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
2628 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2629 Assembler::ptest(dst, src);
2630 }
2631
2632 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2633 assert(rscratch != noreg || always_reachable(src), "missing");
2634
2635 if (reachable(src)) {
2636 Assembler::sqrtss(dst, as_Address(src));
2637 } else {
2638 lea(rscratch, src);
2639 Assembler::sqrtss(dst, Address(rscratch, 0));
2640 }
2641 }
2642
2643 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2644 assert(rscratch != noreg || always_reachable(src), "missing");
2645
2646 if (reachable(src)) {
2647 Assembler::subsd(dst, as_Address(src));
2648 } else {
2649 lea(rscratch, src);
2650 Assembler::subsd(dst, Address(rscratch, 0));
2651 }
2652 }
2653
2654 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch) {
2655 assert(rscratch != noreg || always_reachable(src), "missing");
2656
2657 if (reachable(src)) {
2658 Assembler::roundsd(dst, as_Address(src), rmode);
2659 } else {
2660 lea(rscratch, src);
2661 Assembler::roundsd(dst, Address(rscratch, 0), rmode);
2662 }
2663 }
2664
2665 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2666 assert(rscratch != noreg || always_reachable(src), "missing");
2667
2668 if (reachable(src)) {
2669 Assembler::subss(dst, as_Address(src));
2670 } else {
2671 lea(rscratch, src);
2672 Assembler::subss(dst, Address(rscratch, 0));
2673 }
2674 }
2675
2676 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2677 assert(rscratch != noreg || always_reachable(src), "missing");
2678
2679 if (reachable(src)) {
2680 Assembler::ucomisd(dst, as_Address(src));
2681 } else {
2682 lea(rscratch, src);
2683 Assembler::ucomisd(dst, Address(rscratch, 0));
2684 }
2685 }
2686
2687 void MacroAssembler::evucomxsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2688 assert(rscratch != noreg || always_reachable(src), "missing");
2689
2690 if (reachable(src)) {
2691 Assembler::evucomxsd(dst, as_Address(src));
2692 } else {
2693 lea(rscratch, src);
2694 Assembler::evucomxsd(dst, Address(rscratch, 0));
2695 }
2696 }
2697
2698 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2699 assert(rscratch != noreg || always_reachable(src), "missing");
2700
2701 if (reachable(src)) {
2702 Assembler::ucomiss(dst, as_Address(src));
2703 } else {
2704 lea(rscratch, src);
2705 Assembler::ucomiss(dst, Address(rscratch, 0));
2706 }
2707 }
2708
2709 void MacroAssembler::evucomxss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2710 assert(rscratch != noreg || always_reachable(src), "missing");
2711
2712 if (reachable(src)) {
2713 Assembler::evucomxss(dst, as_Address(src));
2714 } else {
2715 lea(rscratch, src);
2716 Assembler::evucomxss(dst, Address(rscratch, 0));
2717 }
2718 }
2719
2720 void MacroAssembler::evucomish(XMMRegister dst, AddressLiteral src, Register rscratch) {
2721 assert(rscratch != noreg || always_reachable(src), "missing");
2722
2723 if (reachable(src)) {
2724 Assembler::evucomish(dst, as_Address(src));
2725 } else {
2726 lea(rscratch, src);
2727 Assembler::evucomish(dst, Address(rscratch, 0));
2728 }
2729 }
2730
2731 void MacroAssembler::evucomxsh(XMMRegister dst, AddressLiteral src, Register rscratch) {
2732 assert(rscratch != noreg || always_reachable(src), "missing");
2733
2734 if (reachable(src)) {
2735 Assembler::evucomxsh(dst, as_Address(src));
2736 } else {
2737 lea(rscratch, src);
2738 Assembler::evucomxsh(dst, Address(rscratch, 0));
2739 }
2740 }
2741
2742 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2743 assert(rscratch != noreg || always_reachable(src), "missing");
2744
2745 // Used in sign-bit flipping with aligned address.
2746 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
2747
2748 if (UseAVX > 2 &&
2749 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2750 (dst->encoding() >= 16)) {
2751 vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch);
2752 } else if (reachable(src)) {
2753 Assembler::xorpd(dst, as_Address(src));
2754 } else {
2755 lea(rscratch, src);
2756 Assembler::xorpd(dst, Address(rscratch, 0));
2757 }
2758 }
2759
2760 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
2761 if (UseAVX > 2 &&
2762 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2763 ((dst->encoding() >= 16) || (src->encoding() >= 16))) {
2764 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
2765 } else {
2766 Assembler::xorpd(dst, src);
2767 }
2768 }
2769
2770 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
2771 if (UseAVX > 2 &&
2772 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2773 ((dst->encoding() >= 16) || (src->encoding() >= 16))) {
2774 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
2775 } else {
2776 Assembler::xorps(dst, src);
2777 }
2778 }
2779
2780 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register rscratch) {
2781 assert(rscratch != noreg || always_reachable(src), "missing");
2782
2783 // Used in sign-bit flipping with aligned address.
2784 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
2785
2786 if (UseAVX > 2 &&
2787 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2788 (dst->encoding() >= 16)) {
2789 vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch);
2790 } else if (reachable(src)) {
2791 Assembler::xorps(dst, as_Address(src));
2792 } else {
2793 lea(rscratch, src);
2794 Assembler::xorps(dst, Address(rscratch, 0));
2795 }
2796 }
2797
2798 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src, Register rscratch) {
2799 assert(rscratch != noreg || always_reachable(src), "missing");
2800
2801 // Used in sign-bit flipping with aligned address.
2802 bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
2803 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
2804 if (reachable(src)) {
2805 Assembler::pshufb(dst, as_Address(src));
2806 } else {
2807 lea(rscratch, src);
2808 Assembler::pshufb(dst, Address(rscratch, 0));
2809 }
2810 }
2811
2812 // AVX 3-operands instructions
2813
2814 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
2815 assert(rscratch != noreg || always_reachable(src), "missing");
2816
2817 if (reachable(src)) {
2818 vaddsd(dst, nds, as_Address(src));
2819 } else {
2820 lea(rscratch, src);
2821 vaddsd(dst, nds, Address(rscratch, 0));
2822 }
2823 }
2824
2825 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
2826 assert(rscratch != noreg || always_reachable(src), "missing");
2827
2828 if (reachable(src)) {
2829 vaddss(dst, nds, as_Address(src));
2830 } else {
2831 lea(rscratch, src);
2832 vaddss(dst, nds, Address(rscratch, 0));
2833 }
2834 }
2835
2836 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2837 assert(UseAVX > 0, "requires some form of AVX");
2838 assert(rscratch != noreg || always_reachable(src), "missing");
2839
2840 if (reachable(src)) {
2841 Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
2842 } else {
2843 lea(rscratch, src);
2844 Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
2845 }
2846 }
2847
2848 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2849 assert(UseAVX > 0, "requires some form of AVX");
2850 assert(rscratch != noreg || always_reachable(src), "missing");
2851
2852 if (reachable(src)) {
2853 Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
2854 } else {
2855 lea(rscratch, src);
2856 Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
2857 }
2858 }
2859
2860 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
2861 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
2862 assert(rscratch != noreg || always_reachable(negate_field), "missing");
2863
2864 vandps(dst, nds, negate_field, vector_len, rscratch);
2865 }
2866
2867 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
2868 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
2869 assert(rscratch != noreg || always_reachable(negate_field), "missing");
2870
2871 vandpd(dst, nds, negate_field, vector_len, rscratch);
2872 }
2873
2874 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
2875 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2876 Assembler::vpaddb(dst, nds, src, vector_len);
2877 }
2878
2879 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
2880 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2881 Assembler::vpaddb(dst, nds, src, vector_len);
2882 }
2883
2884 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
2885 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2886 Assembler::vpaddw(dst, nds, src, vector_len);
2887 }
2888
2889 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
2890 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2891 Assembler::vpaddw(dst, nds, src, vector_len);
2892 }
2893
2894 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2895 assert(rscratch != noreg || always_reachable(src), "missing");
2896
2897 if (reachable(src)) {
2898 Assembler::vpand(dst, nds, as_Address(src), vector_len);
2899 } else {
2900 lea(rscratch, src);
2901 Assembler::vpand(dst, nds, Address(rscratch, 0), vector_len);
2902 }
2903 }
2904
2905 void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2906 assert(rscratch != noreg || always_reachable(src), "missing");
2907
2908 if (reachable(src)) {
2909 Assembler::vpbroadcastd(dst, as_Address(src), vector_len);
2910 } else {
2911 lea(rscratch, src);
2912 Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len);
2913 }
2914 }
2915
2916 void MacroAssembler::vbroadcasti128(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2917 assert(rscratch != noreg || always_reachable(src), "missing");
2918
2919 if (reachable(src)) {
2920 Assembler::vbroadcasti128(dst, as_Address(src), vector_len);
2921 } else {
2922 lea(rscratch, src);
2923 Assembler::vbroadcasti128(dst, Address(rscratch, 0), vector_len);
2924 }
2925 }
2926
2927 void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2928 assert(rscratch != noreg || always_reachable(src), "missing");
2929
2930 if (reachable(src)) {
2931 Assembler::vpbroadcastq(dst, as_Address(src), vector_len);
2932 } else {
2933 lea(rscratch, src);
2934 Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len);
2935 }
2936 }
2937
2938 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2939 assert(rscratch != noreg || always_reachable(src), "missing");
2940
2941 if (reachable(src)) {
2942 Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
2943 } else {
2944 lea(rscratch, src);
2945 Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
2946 }
2947 }
2948
2949 void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2950 assert(rscratch != noreg || always_reachable(src), "missing");
2951
2952 if (reachable(src)) {
2953 Assembler::vbroadcastss(dst, as_Address(src), vector_len);
2954 } else {
2955 lea(rscratch, src);
2956 Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len);
2957 }
2958 }
2959
2960 // Vector float blend
2961 // vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
2962 void MacroAssembler::vblendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
2963 // WARN: Allow dst == (src1|src2), mask == scratch
2964 bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1 &&
2965 !(VM_Version::is_intel_darkmont() && (dst == src1)); // partially fixed on Darkmont
2966 bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst;
2967 bool dst_available = dst != mask && (dst != src1 || dst != src2);
2968 if (blend_emulation && scratch_available && dst_available) {
2969 if (compute_mask) {
2970 vpsrad(scratch, mask, 32, vector_len);
2971 mask = scratch;
2972 }
2973 if (dst == src1) {
2974 vpandn(dst, mask, src1, vector_len); // if mask == 0, src1
2975 vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
2976 } else {
2977 vpand (dst, mask, src2, vector_len); // if mask == 1, src2
2978 vpandn(scratch, mask, src1, vector_len); // if mask == 0, src1
2979 }
2980 vpor(dst, dst, scratch, vector_len);
2981 } else {
2982 Assembler::vblendvps(dst, src1, src2, mask, vector_len);
2983 }
2984 }
2985
2986 // vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
2987 void MacroAssembler::vblendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
2988 // WARN: Allow dst == (src1|src2), mask == scratch
2989 bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1 &&
2990 !(VM_Version::is_intel_darkmont() && (dst == src1)); // partially fixed on Darkmont
2991 bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst && (!compute_mask || scratch != mask);
2992 bool dst_available = dst != mask && (dst != src1 || dst != src2);
2993 if (blend_emulation && scratch_available && dst_available) {
2994 if (compute_mask) {
2995 vpxor(scratch, scratch, scratch, vector_len);
2996 vpcmpgtq(scratch, scratch, mask, vector_len);
2997 mask = scratch;
2998 }
2999 if (dst == src1) {
3000 vpandn(dst, mask, src1, vector_len); // if mask == 0, src
3001 vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
3002 } else {
3003 vpand (dst, mask, src2, vector_len); // if mask == 1, src2
3004 vpandn(scratch, mask, src1, vector_len); // if mask == 0, src
3005 }
3006 vpor(dst, dst, scratch, vector_len);
3007 } else {
3008 Assembler::vblendvpd(dst, src1, src2, mask, vector_len);
3009 }
3010 }
3011
3012 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3013 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3014 Assembler::vpcmpeqb(dst, nds, src, vector_len);
3015 }
3016
3017 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) {
3018 assert(((dst->encoding() < 16 && src1->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3019 Assembler::vpcmpeqb(dst, src1, src2, vector_len);
3020 }
3021
3022 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3023 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3024 Assembler::vpcmpeqw(dst, nds, src, vector_len);
3025 }
3026
3027 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3028 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3029 Assembler::vpcmpeqw(dst, nds, src, vector_len);
3030 }
3031
3032 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3033 assert(rscratch != noreg || always_reachable(src), "missing");
3034
3035 if (reachable(src)) {
3036 Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3037 } else {
3038 lea(rscratch, src);
3039 Assembler::evpcmpeqd(kdst, mask, nds, Address(rscratch, 0), vector_len);
3040 }
3041 }
3042
3043 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3044 int comparison, bool is_signed, int vector_len, Register rscratch) {
3045 assert(rscratch != noreg || always_reachable(src), "missing");
3046
3047 if (reachable(src)) {
3048 Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3049 } else {
3050 lea(rscratch, src);
3051 Assembler::evpcmpd(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3052 }
3053 }
3054
3055 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3056 int comparison, bool is_signed, int vector_len, Register rscratch) {
3057 assert(rscratch != noreg || always_reachable(src), "missing");
3058
3059 if (reachable(src)) {
3060 Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3061 } else {
3062 lea(rscratch, src);
3063 Assembler::evpcmpq(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3064 }
3065 }
3066
3067 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3068 int comparison, bool is_signed, int vector_len, Register rscratch) {
3069 assert(rscratch != noreg || always_reachable(src), "missing");
3070
3071 if (reachable(src)) {
3072 Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3073 } else {
3074 lea(rscratch, src);
3075 Assembler::evpcmpb(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3076 }
3077 }
3078
3079 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3080 int comparison, bool is_signed, int vector_len, Register rscratch) {
3081 assert(rscratch != noreg || always_reachable(src), "missing");
3082
3083 if (reachable(src)) {
3084 Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3085 } else {
3086 lea(rscratch, src);
3087 Assembler::evpcmpw(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3088 }
3089 }
3090
3091 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3092 if (width == Assembler::Q) {
3093 Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3094 } else {
3095 Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3096 }
3097 }
3098
3099 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
3100 int eq_cond_enc = 0x29;
3101 int gt_cond_enc = 0x37;
3102 if (width != Assembler::Q) {
3103 eq_cond_enc = 0x74 + width;
3104 gt_cond_enc = 0x64 + width;
3105 }
3106 switch (cond) {
3107 case eq:
3108 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3109 break;
3110 case neq:
3111 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3112 vallones(xtmp, vector_len);
3113 vpxor(dst, xtmp, dst, vector_len);
3114 break;
3115 case le:
3116 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3117 vallones(xtmp, vector_len);
3118 vpxor(dst, xtmp, dst, vector_len);
3119 break;
3120 case nlt:
3121 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3122 vallones(xtmp, vector_len);
3123 vpxor(dst, xtmp, dst, vector_len);
3124 break;
3125 case lt:
3126 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3127 break;
3128 case nle:
3129 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3130 break;
3131 default:
3132 assert(false, "Should not reach here");
3133 }
3134 }
3135
3136 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3137 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3138 Assembler::vpmovzxbw(dst, src, vector_len);
3139 }
3140
3141 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3142 assert((src->encoding() < 16),"XMM register should be 0-15");
3143 Assembler::vpmovmskb(dst, src, vector_len);
3144 }
3145
3146 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3147 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3148 Assembler::vpmullw(dst, nds, src, vector_len);
3149 }
3150
3151 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3152 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3153 Assembler::vpmullw(dst, nds, src, vector_len);
3154 }
3155
3156 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3157 assert((UseAVX > 0), "AVX support is needed");
3158 assert(rscratch != noreg || always_reachable(src), "missing");
3159
3160 if (reachable(src)) {
3161 Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3162 } else {
3163 lea(rscratch, src);
3164 Assembler::vpmulld(dst, nds, Address(rscratch, 0), vector_len);
3165 }
3166 }
3167
3168 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3169 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3170 Assembler::vpsubb(dst, nds, src, vector_len);
3171 }
3172
3173 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3174 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3175 Assembler::vpsubb(dst, nds, src, vector_len);
3176 }
3177
3178 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3179 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3180 Assembler::vpsubw(dst, nds, src, vector_len);
3181 }
3182
3183 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3184 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3185 Assembler::vpsubw(dst, nds, src, vector_len);
3186 }
3187
3188 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3189 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3190 Assembler::vpsraw(dst, nds, shift, vector_len);
3191 }
3192
3193 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3194 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3195 Assembler::vpsraw(dst, nds, shift, vector_len);
3196 }
3197
3198 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3199 assert(UseAVX > 2,"");
3200 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3201 vector_len = 2;
3202 }
3203 Assembler::evpsraq(dst, nds, shift, vector_len);
3204 }
3205
3206 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3207 assert(UseAVX > 2,"");
3208 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3209 vector_len = 2;
3210 }
3211 Assembler::evpsraq(dst, nds, shift, vector_len);
3212 }
3213
3214 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3215 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3216 Assembler::vpsrlw(dst, nds, shift, vector_len);
3217 }
3218
3219 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3220 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3221 Assembler::vpsrlw(dst, nds, shift, vector_len);
3222 }
3223
3224 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3225 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3226 Assembler::vpsllw(dst, nds, shift, vector_len);
3227 }
3228
3229 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3230 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3231 Assembler::vpsllw(dst, nds, shift, vector_len);
3232 }
3233
3234 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3235 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3236 Assembler::vptest(dst, src);
3237 }
3238
3239 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3240 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3241 Assembler::punpcklbw(dst, src);
3242 }
3243
3244 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3245 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3246 Assembler::pshufd(dst, src, mode);
3247 }
3248
3249 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3250 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3251 Assembler::pshuflw(dst, src, mode);
3252 }
3253
3254 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3255 assert(rscratch != noreg || always_reachable(src), "missing");
3256
3257 if (reachable(src)) {
3258 vandpd(dst, nds, as_Address(src), vector_len);
3259 } else {
3260 lea(rscratch, src);
3261 vandpd(dst, nds, Address(rscratch, 0), vector_len);
3262 }
3263 }
3264
3265 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3266 assert(rscratch != noreg || always_reachable(src), "missing");
3267
3268 if (reachable(src)) {
3269 vandps(dst, nds, as_Address(src), vector_len);
3270 } else {
3271 lea(rscratch, src);
3272 vandps(dst, nds, Address(rscratch, 0), vector_len);
3273 }
3274 }
3275
3276 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3277 bool merge, int vector_len, Register rscratch) {
3278 assert(rscratch != noreg || always_reachable(src), "missing");
3279
3280 if (reachable(src)) {
3281 Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3282 } else {
3283 lea(rscratch, src);
3284 Assembler::evpord(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
3285 }
3286 }
3287
3288 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3289 assert(rscratch != noreg || always_reachable(src), "missing");
3290
3291 if (reachable(src)) {
3292 vdivsd(dst, nds, as_Address(src));
3293 } else {
3294 lea(rscratch, src);
3295 vdivsd(dst, nds, Address(rscratch, 0));
3296 }
3297 }
3298
3299 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3300 assert(rscratch != noreg || always_reachable(src), "missing");
3301
3302 if (reachable(src)) {
3303 vdivss(dst, nds, as_Address(src));
3304 } else {
3305 lea(rscratch, src);
3306 vdivss(dst, nds, Address(rscratch, 0));
3307 }
3308 }
3309
3310 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3311 assert(rscratch != noreg || always_reachable(src), "missing");
3312
3313 if (reachable(src)) {
3314 vmulsd(dst, nds, as_Address(src));
3315 } else {
3316 lea(rscratch, src);
3317 vmulsd(dst, nds, Address(rscratch, 0));
3318 }
3319 }
3320
3321 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3322 assert(rscratch != noreg || always_reachable(src), "missing");
3323
3324 if (reachable(src)) {
3325 vmulss(dst, nds, as_Address(src));
3326 } else {
3327 lea(rscratch, src);
3328 vmulss(dst, nds, Address(rscratch, 0));
3329 }
3330 }
3331
3332 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3333 assert(rscratch != noreg || always_reachable(src), "missing");
3334
3335 if (reachable(src)) {
3336 vsubsd(dst, nds, as_Address(src));
3337 } else {
3338 lea(rscratch, src);
3339 vsubsd(dst, nds, Address(rscratch, 0));
3340 }
3341 }
3342
3343 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3344 assert(rscratch != noreg || always_reachable(src), "missing");
3345
3346 if (reachable(src)) {
3347 vsubss(dst, nds, as_Address(src));
3348 } else {
3349 lea(rscratch, src);
3350 vsubss(dst, nds, Address(rscratch, 0));
3351 }
3352 }
3353
3354 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3355 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3356 assert(rscratch != noreg || always_reachable(src), "missing");
3357
3358 vxorps(dst, nds, src, Assembler::AVX_128bit, rscratch);
3359 }
3360
3361 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3362 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3363 assert(rscratch != noreg || always_reachable(src), "missing");
3364
3365 vxorpd(dst, nds, src, Assembler::AVX_128bit, rscratch);
3366 }
3367
3368 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3369 assert(rscratch != noreg || always_reachable(src), "missing");
3370
3371 if (reachable(src)) {
3372 vxorpd(dst, nds, as_Address(src), vector_len);
3373 } else {
3374 lea(rscratch, src);
3375 vxorpd(dst, nds, Address(rscratch, 0), vector_len);
3376 }
3377 }
3378
3379 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3380 assert(rscratch != noreg || always_reachable(src), "missing");
3381
3382 if (reachable(src)) {
3383 vxorps(dst, nds, as_Address(src), vector_len);
3384 } else {
3385 lea(rscratch, src);
3386 vxorps(dst, nds, Address(rscratch, 0), vector_len);
3387 }
3388 }
3389
3390 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3391 assert(rscratch != noreg || always_reachable(src), "missing");
3392
3393 if (UseAVX > 1 || (vector_len < 1)) {
3394 if (reachable(src)) {
3395 Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3396 } else {
3397 lea(rscratch, src);
3398 Assembler::vpxor(dst, nds, Address(rscratch, 0), vector_len);
3399 }
3400 } else {
3401 MacroAssembler::vxorpd(dst, nds, src, vector_len, rscratch);
3402 }
3403 }
3404
3405 void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3406 assert(rscratch != noreg || always_reachable(src), "missing");
3407
3408 if (reachable(src)) {
3409 Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3410 } else {
3411 lea(rscratch, src);
3412 Assembler::vpermd(dst, nds, Address(rscratch, 0), vector_len);
3413 }
3414 }
3415
3416 void MacroAssembler::clear_jobject_tag(Register possibly_non_local) {
3417 const int32_t inverted_mask = ~static_cast<int32_t>(JNIHandles::tag_mask);
3418 STATIC_ASSERT(inverted_mask == -4); // otherwise check this code
3419 // The inverted mask is sign-extended
3420 andptr(possibly_non_local, inverted_mask);
3421 }
3422
3423 void MacroAssembler::resolve_jobject(Register value,
3424 Register tmp) {
3425 Register thread = r15_thread;
3426 assert_different_registers(value, thread, tmp);
3427 Label done, tagged, weak_tagged;
3428 testptr(value, value);
3429 jcc(Assembler::zero, done); // Use null as-is.
3430 testptr(value, JNIHandles::tag_mask); // Test for tag.
3431 jcc(Assembler::notZero, tagged);
3432
3433 // Resolve local handle
3434 access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp);
3435 verify_oop(value);
3436 jmp(done);
3437
3438 bind(tagged);
3439 testptr(value, JNIHandles::TypeTag::weak_global); // Test for weak tag.
3440 jcc(Assembler::notZero, weak_tagged);
3441
3442 // Resolve global handle
3443 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp);
3444 verify_oop(value);
3445 jmp(done);
3446
3447 bind(weak_tagged);
3448 // Resolve jweak.
3449 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3450 value, Address(value, -JNIHandles::TypeTag::weak_global), tmp);
3451 verify_oop(value);
3452
3453 bind(done);
3454 }
3455
3456 void MacroAssembler::resolve_global_jobject(Register value,
3457 Register tmp) {
3458 Register thread = r15_thread;
3459 assert_different_registers(value, thread, tmp);
3460 Label done;
3461
3462 testptr(value, value);
3463 jcc(Assembler::zero, done); // Use null as-is.
3464
3465 #ifdef ASSERT
3466 {
3467 Label valid_global_tag;
3468 testptr(value, JNIHandles::TypeTag::global); // Test for global tag.
3469 jcc(Assembler::notZero, valid_global_tag);
3470 stop("non global jobject using resolve_global_jobject");
3471 bind(valid_global_tag);
3472 }
3473 #endif
3474
3475 // Resolve global handle
3476 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp);
3477 verify_oop(value);
3478
3479 bind(done);
3480 }
3481
3482 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3483 subq(dst, imm32);
3484 }
3485
3486 // Force generation of a 4 byte immediate value even if it fits into 8bit
3487 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3488 subq_imm32(dst, imm32);
3489 }
3490
3491 void MacroAssembler::subptr(Register dst, Register src) {
3492 subq(dst, src);
3493 }
3494
3495 // C++ bool manipulation
3496 void MacroAssembler::testbool(Register dst) {
3497 if(sizeof(bool) == 1)
3498 testb(dst, 0xff);
3499 else if(sizeof(bool) == 2) {
3500 // testw implementation needed for two byte bools
3501 ShouldNotReachHere();
3502 } else if(sizeof(bool) == 4)
3503 testl(dst, dst);
3504 else
3505 // unsupported
3506 ShouldNotReachHere();
3507 }
3508
3509 void MacroAssembler::testptr(Register dst, Register src) {
3510 testq(dst, src);
3511 }
3512
3513 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3514 void MacroAssembler::tlab_allocate(Register obj,
3515 Register var_size_in_bytes,
3516 int con_size_in_bytes,
3517 Register t1,
3518 Register t2,
3519 Label& slow_case) {
3520 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3521 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3522 }
3523
3524 RegSet MacroAssembler::call_clobbered_gp_registers() {
3525 RegSet regs;
3526 regs += RegSet::of(rax, rcx, rdx);
3527 #ifndef _WINDOWS
3528 regs += RegSet::of(rsi, rdi);
3529 #endif
3530 regs += RegSet::range(r8, r11);
3531 if (UseAPX) {
3532 regs += RegSet::range(r16, as_Register(Register::number_of_registers - 1));
3533 }
3534 return regs;
3535 }
3536
3537 XMMRegSet MacroAssembler::call_clobbered_xmm_registers() {
3538 int num_xmm_registers = XMMRegister::available_xmm_registers();
3539 #if defined(_WINDOWS)
3540 XMMRegSet result = XMMRegSet::range(xmm0, xmm5);
3541 if (num_xmm_registers > 16) {
3542 result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1));
3543 }
3544 return result;
3545 #else
3546 return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1));
3547 #endif
3548 }
3549
3550 // C1 only ever uses the first double/float of the XMM register.
3551 static int xmm_save_size() { return sizeof(double); }
3552
3553 static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
3554 masm->movdbl(Address(rsp, offset), reg);
3555 }
3556
3557 static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
3558 masm->movdbl(reg, Address(rsp, offset));
3559 }
3560
3561 static int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers,
3562 bool save_fpu, int& gp_area_size, int& xmm_area_size) {
3563
3564 gp_area_size = align_up(gp_registers.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size,
3565 StackAlignmentInBytes);
3566 xmm_area_size = save_fpu ? xmm_registers.size() * xmm_save_size() : 0;
3567
3568 return gp_area_size + xmm_area_size;
3569 }
3570
3571 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) {
3572 block_comment("push_call_clobbered_registers start");
3573 // Regular registers
3574 RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude;
3575
3576 int gp_area_size;
3577 int xmm_area_size;
3578 int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu,
3579 gp_area_size, xmm_area_size);
3580 subptr(rsp, total_save_size);
3581
3582 push_set(gp_registers_to_push, 0);
3583
3584 if (save_fpu) {
3585 push_set(call_clobbered_xmm_registers(), gp_area_size);
3586 }
3587
3588 block_comment("push_call_clobbered_registers end");
3589 }
3590
3591 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) {
3592 block_comment("pop_call_clobbered_registers start");
3593
3594 RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude;
3595
3596 int gp_area_size;
3597 int xmm_area_size;
3598 int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu,
3599 gp_area_size, xmm_area_size);
3600
3601 if (restore_fpu) {
3602 pop_set(call_clobbered_xmm_registers(), gp_area_size);
3603 }
3604
3605 pop_set(gp_registers_to_pop, 0);
3606
3607 addptr(rsp, total_save_size);
3608
3609 vzeroupper();
3610
3611 block_comment("pop_call_clobbered_registers end");
3612 }
3613
3614 void MacroAssembler::push_set(XMMRegSet set, int offset) {
3615 assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be");
3616 int spill_offset = offset;
3617
3618 for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) {
3619 save_xmm_register(this, spill_offset, *it);
3620 spill_offset += xmm_save_size();
3621 }
3622 }
3623
3624 void MacroAssembler::pop_set(XMMRegSet set, int offset) {
3625 int restore_size = set.size() * xmm_save_size();
3626 assert(is_aligned(restore_size, StackAlignmentInBytes), "must be");
3627
3628 int restore_offset = offset + restore_size - xmm_save_size();
3629
3630 for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) {
3631 restore_xmm_register(this, restore_offset, *it);
3632 restore_offset -= xmm_save_size();
3633 }
3634 }
3635
3636 void MacroAssembler::push_set(RegSet set, int offset) {
3637 int spill_offset;
3638 if (offset == -1) {
3639 int register_push_size = set.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3640 int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
3641 subptr(rsp, aligned_size);
3642 spill_offset = 0;
3643 } else {
3644 spill_offset = offset;
3645 }
3646
3647 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
3648 movptr(Address(rsp, spill_offset), *it);
3649 spill_offset += Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3650 }
3651 }
3652
3653 void MacroAssembler::pop_set(RegSet set, int offset) {
3654
3655 int gp_reg_size = Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3656 int restore_size = set.size() * gp_reg_size;
3657 int aligned_size = align_up(restore_size, StackAlignmentInBytes);
3658
3659 int restore_offset;
3660 if (offset == -1) {
3661 restore_offset = restore_size - gp_reg_size;
3662 } else {
3663 restore_offset = offset + restore_size - gp_reg_size;
3664 }
3665 for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) {
3666 movptr(*it, Address(rsp, restore_offset));
3667 restore_offset -= gp_reg_size;
3668 }
3669
3670 if (offset == -1) {
3671 addptr(rsp, aligned_size);
3672 }
3673 }
3674
3675 // Preserves the contents of address, destroys the contents length_in_bytes and temp.
3676 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
3677 assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
3678 assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
3679 Label done;
3680
3681 testptr(length_in_bytes, length_in_bytes);
3682 jcc(Assembler::zero, done);
3683
3684 // initialize topmost word, divide index by 2, check if odd and test if zero
3685 // note: for the remaining code to work, index must be a multiple of BytesPerWord
3686 #ifdef ASSERT
3687 {
3688 Label L;
3689 testptr(length_in_bytes, BytesPerWord - 1);
3690 jcc(Assembler::zero, L);
3691 stop("length must be a multiple of BytesPerWord");
3692 bind(L);
3693 }
3694 #endif
3695 Register index = length_in_bytes;
3696 xorptr(temp, temp); // use _zero reg to clear memory (shorter code)
3697 if (UseIncDec) {
3698 shrptr(index, 3); // divide by 8/16 and set carry flag if bit 2 was set
3699 } else {
3700 shrptr(index, 2); // use 2 instructions to avoid partial flag stall
3701 shrptr(index, 1);
3702 }
3703
3704 // initialize remaining object fields: index is a multiple of 2 now
3705 {
3706 Label loop;
3707 bind(loop);
3708 movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
3709 decrement(index);
3710 jcc(Assembler::notZero, loop);
3711 }
3712
3713 bind(done);
3714 }
3715
3716 // Look up the method for a megamorphic invokeinterface call.
3717 // The target method is determined by <intf_klass, itable_index>.
3718 // The receiver klass is in recv_klass.
3719 // On success, the result will be in method_result, and execution falls through.
3720 // On failure, execution transfers to the given label.
3721 void MacroAssembler::lookup_interface_method(Register recv_klass,
3722 Register intf_klass,
3723 RegisterOrConstant itable_index,
3724 Register method_result,
3725 Register scan_temp,
3726 Label& L_no_such_interface,
3727 bool return_method) {
3728 assert_different_registers(recv_klass, intf_klass, scan_temp);
3729 assert_different_registers(method_result, intf_klass, scan_temp);
3730 assert(recv_klass != method_result || !return_method,
3731 "recv_klass can be destroyed when method isn't needed");
3732
3733 assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3734 "caller must use same register for non-constant itable index as for method");
3735
3736 // Compute start of first itableOffsetEntry (which is at the end of the vtable)
3737 int vtable_base = in_bytes(Klass::vtable_start_offset());
3738 int itentry_off = in_bytes(itableMethodEntry::method_offset());
3739 int scan_step = itableOffsetEntry::size() * wordSize;
3740 int vte_size = vtableEntry::size_in_bytes();
3741 Address::ScaleFactor times_vte_scale = Address::times_ptr;
3742 assert(vte_size == wordSize, "else adjust times_vte_scale");
3743
3744 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3745
3746 // Could store the aligned, prescaled offset in the klass.
3747 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
3748
3749 if (return_method) {
3750 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3751 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3752 lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
3753 }
3754
3755 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
3756 // if (scan->interface() == intf) {
3757 // result = (klass + scan->offset() + itable_index);
3758 // }
3759 // }
3760 Label search, found_method;
3761
3762 for (int peel = 1; peel >= 0; peel--) {
3763 movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset()));
3764 cmpptr(intf_klass, method_result);
3765
3766 if (peel) {
3767 jccb(Assembler::equal, found_method);
3768 } else {
3769 jccb(Assembler::notEqual, search);
3770 // (invert the test to fall through to found_method...)
3771 }
3772
3773 if (!peel) break;
3774
3775 bind(search);
3776
3777 // Check that the previous entry is non-null. A null entry means that
3778 // the receiver class doesn't implement the interface, and wasn't the
3779 // same as when the caller was compiled.
3780 testptr(method_result, method_result);
3781 jcc(Assembler::zero, L_no_such_interface);
3782 addptr(scan_temp, scan_step);
3783 }
3784
3785 bind(found_method);
3786
3787 if (return_method) {
3788 // Got a hit.
3789 movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset()));
3790 movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
3791 }
3792 }
3793
3794 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
3795 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
3796 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
3797 // The target method is determined by <holder_klass, itable_index>.
3798 // The receiver klass is in recv_klass.
3799 // On success, the result will be in method_result, and execution falls through.
3800 // On failure, execution transfers to the given label.
3801 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
3802 Register holder_klass,
3803 Register resolved_klass,
3804 Register method_result,
3805 Register scan_temp,
3806 Register temp_reg2,
3807 Register receiver,
3808 int itable_index,
3809 Label& L_no_such_interface) {
3810 assert_different_registers(recv_klass, method_result, holder_klass, resolved_klass, scan_temp, temp_reg2, receiver);
3811 Register temp_itbl_klass = method_result;
3812 Register temp_reg = (temp_reg2 == noreg ? recv_klass : temp_reg2); // reuse recv_klass register on 32-bit x86 impl
3813
3814 int vtable_base = in_bytes(Klass::vtable_start_offset());
3815 int itentry_off = in_bytes(itableMethodEntry::method_offset());
3816 int scan_step = itableOffsetEntry::size() * wordSize;
3817 int vte_size = vtableEntry::size_in_bytes();
3818 int ioffset = in_bytes(itableOffsetEntry::interface_offset());
3819 int ooffset = in_bytes(itableOffsetEntry::offset_offset());
3820 Address::ScaleFactor times_vte_scale = Address::times_ptr;
3821 assert(vte_size == wordSize, "adjust times_vte_scale");
3822
3823 Label L_loop_scan_resolved_entry, L_resolved_found, L_holder_found;
3824
3825 // temp_itbl_klass = recv_klass.itable[0]
3826 // scan_temp = &recv_klass.itable[0] + step
3827 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3828 movptr(temp_itbl_klass, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset));
3829 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset + scan_step));
3830 xorptr(temp_reg, temp_reg);
3831
3832 // Initial checks:
3833 // - if (holder_klass != resolved_klass), go to "scan for resolved"
3834 // - if (itable[0] == 0), no such interface
3835 // - if (itable[0] == holder_klass), shortcut to "holder found"
3836 cmpptr(holder_klass, resolved_klass);
3837 jccb(Assembler::notEqual, L_loop_scan_resolved_entry);
3838 testptr(temp_itbl_klass, temp_itbl_klass);
3839 jccb(Assembler::zero, L_no_such_interface);
3840 cmpptr(holder_klass, temp_itbl_klass);
3841 jccb(Assembler::equal, L_holder_found);
3842
3843 // Loop: Look for holder_klass record in itable
3844 // do {
3845 // tmp = itable[index];
3846 // index += step;
3847 // if (tmp == holder_klass) {
3848 // goto L_holder_found; // Found!
3849 // }
3850 // } while (tmp != 0);
3851 // goto L_no_such_interface // Not found.
3852 Label L_scan_holder;
3853 bind(L_scan_holder);
3854 movptr(temp_itbl_klass, Address(scan_temp, 0));
3855 addptr(scan_temp, scan_step);
3856 cmpptr(holder_klass, temp_itbl_klass);
3857 jccb(Assembler::equal, L_holder_found);
3858 testptr(temp_itbl_klass, temp_itbl_klass);
3859 jccb(Assembler::notZero, L_scan_holder);
3860
3861 jmpb(L_no_such_interface);
3862
3863 // Loop: Look for resolved_class record in itable
3864 // do {
3865 // tmp = itable[index];
3866 // index += step;
3867 // if (tmp == holder_klass) {
3868 // // Also check if we have met a holder klass
3869 // holder_tmp = itable[index-step-ioffset];
3870 // }
3871 // if (tmp == resolved_klass) {
3872 // goto L_resolved_found; // Found!
3873 // }
3874 // } while (tmp != 0);
3875 // goto L_no_such_interface // Not found.
3876 //
3877 Label L_loop_scan_resolved;
3878 bind(L_loop_scan_resolved);
3879 movptr(temp_itbl_klass, Address(scan_temp, 0));
3880 addptr(scan_temp, scan_step);
3881 bind(L_loop_scan_resolved_entry);
3882 cmpptr(holder_klass, temp_itbl_klass);
3883 cmovl(Assembler::equal, temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
3884 cmpptr(resolved_klass, temp_itbl_klass);
3885 jccb(Assembler::equal, L_resolved_found);
3886 testptr(temp_itbl_klass, temp_itbl_klass);
3887 jccb(Assembler::notZero, L_loop_scan_resolved);
3888
3889 jmpb(L_no_such_interface);
3890
3891 Label L_ready;
3892
3893 // See if we already have a holder klass. If not, go and scan for it.
3894 bind(L_resolved_found);
3895 testptr(temp_reg, temp_reg);
3896 jccb(Assembler::zero, L_scan_holder);
3897 jmpb(L_ready);
3898
3899 bind(L_holder_found);
3900 movl(temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
3901
3902 // Finally, temp_reg contains holder_klass vtable offset
3903 bind(L_ready);
3904 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3905 if (temp_reg2 == noreg) { // recv_klass register is clobbered for 32-bit x86 impl
3906 load_klass(scan_temp, receiver, noreg);
3907 movptr(method_result, Address(scan_temp, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
3908 } else {
3909 movptr(method_result, Address(recv_klass, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
3910 }
3911 }
3912
3913
3914 // virtual method calling
3915 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3916 RegisterOrConstant vtable_index,
3917 Register method_result) {
3918 const ByteSize base = Klass::vtable_start_offset();
3919 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
3920 Address vtable_entry_addr(recv_klass,
3921 vtable_index, Address::times_ptr,
3922 base + vtableEntry::method_offset());
3923 movptr(method_result, vtable_entry_addr);
3924 }
3925
3926
3927 void MacroAssembler::check_klass_subtype(Register sub_klass,
3928 Register super_klass,
3929 Register temp_reg,
3930 Label& L_success) {
3931 Label L_failure;
3932 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, nullptr);
3933 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, nullptr);
3934 bind(L_failure);
3935 }
3936
3937
3938 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3939 Register super_klass,
3940 Register temp_reg,
3941 Label* L_success,
3942 Label* L_failure,
3943 Label* L_slow_path,
3944 RegisterOrConstant super_check_offset) {
3945 assert_different_registers(sub_klass, super_klass, temp_reg);
3946 bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
3947 if (super_check_offset.is_register()) {
3948 assert_different_registers(sub_klass, super_klass,
3949 super_check_offset.as_register());
3950 } else if (must_load_sco) {
3951 assert(temp_reg != noreg, "supply either a temp or a register offset");
3952 }
3953
3954 Label L_fallthrough;
3955 int label_nulls = 0;
3956 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
3957 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
3958 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
3959 assert(label_nulls <= 1, "at most one null in the batch");
3960
3961 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3962 int sco_offset = in_bytes(Klass::super_check_offset_offset());
3963 Address super_check_offset_addr(super_klass, sco_offset);
3964
3965 // Hacked jcc, which "knows" that L_fallthrough, at least, is in
3966 // range of a jccb. If this routine grows larger, reconsider at
3967 // least some of these.
3968 #define local_jcc(assembler_cond, label) \
3969 if (&(label) == &L_fallthrough) jccb(assembler_cond, label); \
3970 else jcc( assembler_cond, label) /*omit semi*/
3971
3972 // Hacked jmp, which may only be used just before L_fallthrough.
3973 #define final_jmp(label) \
3974 if (&(label) == &L_fallthrough) { /*do nothing*/ } \
3975 else jmp(label) /*omit semi*/
3976
3977 // If the pointers are equal, we are done (e.g., String[] elements).
3978 // This self-check enables sharing of secondary supertype arrays among
3979 // non-primary types such as array-of-interface. Otherwise, each such
3980 // type would need its own customized SSA.
3981 // We move this check to the front of the fast path because many
3982 // type checks are in fact trivially successful in this manner,
3983 // so we get a nicely predicted branch right at the start of the check.
3984 cmpptr(sub_klass, super_klass);
3985 local_jcc(Assembler::equal, *L_success);
3986
3987 // Check the supertype display:
3988 if (must_load_sco) {
3989 // Positive movl does right thing on LP64.
3990 movl(temp_reg, super_check_offset_addr);
3991 super_check_offset = RegisterOrConstant(temp_reg);
3992 }
3993 Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
3994 cmpptr(super_klass, super_check_addr); // load displayed supertype
3995
3996 // This check has worked decisively for primary supers.
3997 // Secondary supers are sought in the super_cache ('super_cache_addr').
3998 // (Secondary supers are interfaces and very deeply nested subtypes.)
3999 // This works in the same check above because of a tricky aliasing
4000 // between the super_cache and the primary super display elements.
4001 // (The 'super_check_addr' can address either, as the case requires.)
4002 // Note that the cache is updated below if it does not help us find
4003 // what we need immediately.
4004 // So if it was a primary super, we can just fail immediately.
4005 // Otherwise, it's the slow path for us (no success at this point).
4006
4007 if (super_check_offset.is_register()) {
4008 local_jcc(Assembler::equal, *L_success);
4009 cmpl(super_check_offset.as_register(), sc_offset);
4010 if (L_failure == &L_fallthrough) {
4011 local_jcc(Assembler::equal, *L_slow_path);
4012 } else {
4013 local_jcc(Assembler::notEqual, *L_failure);
4014 final_jmp(*L_slow_path);
4015 }
4016 } else if (super_check_offset.as_constant() == sc_offset) {
4017 // Need a slow path; fast failure is impossible.
4018 if (L_slow_path == &L_fallthrough) {
4019 local_jcc(Assembler::equal, *L_success);
4020 } else {
4021 local_jcc(Assembler::notEqual, *L_slow_path);
4022 final_jmp(*L_success);
4023 }
4024 } else {
4025 // No slow path; it's a fast decision.
4026 if (L_failure == &L_fallthrough) {
4027 local_jcc(Assembler::equal, *L_success);
4028 } else {
4029 local_jcc(Assembler::notEqual, *L_failure);
4030 final_jmp(*L_success);
4031 }
4032 }
4033
4034 bind(L_fallthrough);
4035
4036 #undef local_jcc
4037 #undef final_jmp
4038 }
4039
4040
4041 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
4042 Register super_klass,
4043 Register temp_reg,
4044 Register temp2_reg,
4045 Label* L_success,
4046 Label* L_failure,
4047 bool set_cond_codes) {
4048 assert_different_registers(sub_klass, super_klass, temp_reg);
4049 if (temp2_reg != noreg)
4050 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4051 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4052
4053 Label L_fallthrough;
4054 int label_nulls = 0;
4055 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4056 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4057 assert(label_nulls <= 1, "at most one null in the batch");
4058
4059 // a couple of useful fields in sub_klass:
4060 int ss_offset = in_bytes(Klass::secondary_supers_offset());
4061 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4062 Address secondary_supers_addr(sub_klass, ss_offset);
4063 Address super_cache_addr( sub_klass, sc_offset);
4064
4065 // Do a linear scan of the secondary super-klass chain.
4066 // This code is rarely used, so simplicity is a virtue here.
4067 // The repne_scan instruction uses fixed registers, which we must spill.
4068 // Don't worry too much about pre-existing connections with the input regs.
4069
4070 assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4071 assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4072
4073 // Get super_klass value into rax (even if it was in rdi or rcx).
4074 bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4075 if (super_klass != rax) {
4076 if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4077 mov(rax, super_klass);
4078 }
4079 if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4080 if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4081
4082 #ifndef PRODUCT
4083 uint* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4084 ExternalAddress pst_counter_addr((address) pst_counter);
4085 lea(rcx, pst_counter_addr);
4086 incrementl(Address(rcx, 0));
4087 #endif //PRODUCT
4088
4089 // We will consult the secondary-super array.
4090 movptr(rdi, secondary_supers_addr);
4091 // Load the array length. (Positive movl does right thing on LP64.)
4092 movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4093 // Skip to start of data.
4094 addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4095
4096 // Scan RCX words at [RDI] for an occurrence of RAX.
4097 // Set NZ/Z based on last compare.
4098 // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
4099 // not change flags (only scas instruction which is repeated sets flags).
4100 // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
4101
4102 testptr(rax,rax); // Set Z = 0
4103 repne_scan();
4104
4105 // Unspill the temp. registers:
4106 if (pushed_rdi) pop(rdi);
4107 if (pushed_rcx) pop(rcx);
4108 if (pushed_rax) pop(rax);
4109
4110 if (set_cond_codes) {
4111 // Special hack for the AD files: rdi is guaranteed non-zero.
4112 assert(!pushed_rdi, "rdi must be left non-null");
4113 // Also, the condition codes are properly set Z/NZ on succeed/failure.
4114 }
4115
4116 if (L_failure == &L_fallthrough)
4117 jccb(Assembler::notEqual, *L_failure);
4118 else jcc(Assembler::notEqual, *L_failure);
4119
4120 // Success. Cache the super we found and proceed in triumph.
4121 movptr(super_cache_addr, super_klass);
4122
4123 if (L_success != &L_fallthrough) {
4124 jmp(*L_success);
4125 }
4126
4127 #undef IS_A_TEMP
4128
4129 bind(L_fallthrough);
4130 }
4131
4132 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4133 Register super_klass,
4134 Register temp_reg,
4135 Register temp2_reg,
4136 Label* L_success,
4137 Label* L_failure,
4138 bool set_cond_codes) {
4139 assert(set_cond_codes == false, "must be false on 64-bit x86");
4140 check_klass_subtype_slow_path
4141 (sub_klass, super_klass, temp_reg, temp2_reg, noreg, noreg,
4142 L_success, L_failure);
4143 }
4144
4145 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4146 Register super_klass,
4147 Register temp_reg,
4148 Register temp2_reg,
4149 Register temp3_reg,
4150 Register temp4_reg,
4151 Label* L_success,
4152 Label* L_failure) {
4153 if (UseSecondarySupersTable) {
4154 check_klass_subtype_slow_path_table
4155 (sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, temp4_reg,
4156 L_success, L_failure);
4157 } else {
4158 check_klass_subtype_slow_path_linear
4159 (sub_klass, super_klass, temp_reg, temp2_reg, L_success, L_failure, /*set_cond_codes*/false);
4160 }
4161 }
4162
4163 Register MacroAssembler::allocate_if_noreg(Register r,
4164 RegSetIterator<Register> &available_regs,
4165 RegSet ®s_to_push) {
4166 if (!r->is_valid()) {
4167 r = *available_regs++;
4168 regs_to_push += r;
4169 }
4170 return r;
4171 }
4172
4173 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
4174 Register super_klass,
4175 Register temp_reg,
4176 Register temp2_reg,
4177 Register temp3_reg,
4178 Register result_reg,
4179 Label* L_success,
4180 Label* L_failure) {
4181 // NB! Callers may assume that, when temp2_reg is a valid register,
4182 // this code sets it to a nonzero value.
4183 bool temp2_reg_was_valid = temp2_reg->is_valid();
4184
4185 RegSet temps = RegSet::of(temp_reg, temp2_reg, temp3_reg);
4186
4187 Label L_fallthrough;
4188 int label_nulls = 0;
4189 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4190 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4191 assert(label_nulls <= 1, "at most one null in the batch");
4192
4193 BLOCK_COMMENT("check_klass_subtype_slow_path_table");
4194
4195 RegSetIterator<Register> available_regs
4196 = (RegSet::of(rax, rcx, rdx, r8) + r9 + r10 + r11 + r12 - temps - sub_klass - super_klass).begin();
4197
4198 RegSet pushed_regs;
4199
4200 temp_reg = allocate_if_noreg(temp_reg, available_regs, pushed_regs);
4201 temp2_reg = allocate_if_noreg(temp2_reg, available_regs, pushed_regs);
4202 temp3_reg = allocate_if_noreg(temp3_reg, available_regs, pushed_regs);
4203 result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
4204 Register temp4_reg = allocate_if_noreg(noreg, available_regs, pushed_regs);
4205
4206 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, result_reg);
4207
4208 {
4209
4210 int register_push_size = pushed_regs.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
4211 int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
4212 subptr(rsp, aligned_size);
4213 push_set(pushed_regs, 0);
4214
4215 lookup_secondary_supers_table_var(sub_klass,
4216 super_klass,
4217 temp_reg, temp2_reg, temp3_reg, temp4_reg, result_reg);
4218 cmpq(result_reg, 0);
4219
4220 // Unspill the temp. registers:
4221 pop_set(pushed_regs, 0);
4222 // Increment SP but do not clobber flags.
4223 lea(rsp, Address(rsp, aligned_size));
4224 }
4225
4226 if (temp2_reg_was_valid) {
4227 movq(temp2_reg, 1);
4228 }
4229
4230 jcc(Assembler::notEqual, *L_failure);
4231
4232 if (L_success != &L_fallthrough) {
4233 jmp(*L_success);
4234 }
4235
4236 bind(L_fallthrough);
4237 }
4238
4239 // population_count variant for running without the POPCNT
4240 // instruction, which was introduced with SSE4.2 in 2008.
4241 void MacroAssembler::population_count(Register dst, Register src,
4242 Register scratch1, Register scratch2) {
4243 assert_different_registers(src, scratch1, scratch2);
4244 if (UsePopCountInstruction) {
4245 Assembler::popcntq(dst, src);
4246 } else {
4247 assert_different_registers(src, scratch1, scratch2);
4248 assert_different_registers(dst, scratch1, scratch2);
4249 Label loop, done;
4250
4251 mov(scratch1, src);
4252 // dst = 0;
4253 // while(scratch1 != 0) {
4254 // dst++;
4255 // scratch1 &= (scratch1 - 1);
4256 // }
4257 xorl(dst, dst);
4258 testq(scratch1, scratch1);
4259 jccb(Assembler::equal, done);
4260 {
4261 bind(loop);
4262 incq(dst);
4263 movq(scratch2, scratch1);
4264 decq(scratch2);
4265 andq(scratch1, scratch2);
4266 jccb(Assembler::notEqual, loop);
4267 }
4268 bind(done);
4269 }
4270 #ifdef ASSERT
4271 mov64(scratch1, 0xCafeBabeDeadBeef);
4272 movq(scratch2, scratch1);
4273 #endif
4274 }
4275
4276 // Ensure that the inline code and the stub are using the same registers.
4277 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \
4278 do { \
4279 assert(r_super_klass == rax, "mismatch"); \
4280 assert(r_array_base == rbx, "mismatch"); \
4281 assert(r_array_length == rcx, "mismatch"); \
4282 assert(r_array_index == rdx, "mismatch"); \
4283 assert(r_sub_klass == rsi || r_sub_klass == noreg, "mismatch"); \
4284 assert(r_bitmap == r11 || r_bitmap == noreg, "mismatch"); \
4285 assert(result == rdi || result == noreg, "mismatch"); \
4286 } while(0)
4287
4288 // Versions of salq and rorq that don't need count to be in rcx
4289
4290 void MacroAssembler::salq(Register dest, Register count) {
4291 if (count == rcx) {
4292 Assembler::salq(dest);
4293 } else {
4294 assert_different_registers(rcx, dest);
4295 xchgq(rcx, count);
4296 Assembler::salq(dest);
4297 xchgq(rcx, count);
4298 }
4299 }
4300
4301 void MacroAssembler::rorq(Register dest, Register count) {
4302 if (count == rcx) {
4303 Assembler::rorq(dest);
4304 } else {
4305 assert_different_registers(rcx, dest);
4306 xchgq(rcx, count);
4307 Assembler::rorq(dest);
4308 xchgq(rcx, count);
4309 }
4310 }
4311
4312 // Return true: we succeeded in generating this code
4313 //
4314 // At runtime, return 0 in result if r_super_klass is a superclass of
4315 // r_sub_klass, otherwise return nonzero. Use this if you know the
4316 // super_klass_slot of the class you're looking for. This is always
4317 // the case for instanceof and checkcast.
4318 void MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
4319 Register r_super_klass,
4320 Register temp1,
4321 Register temp2,
4322 Register temp3,
4323 Register temp4,
4324 Register result,
4325 u1 super_klass_slot) {
4326 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
4327
4328 Label L_fallthrough, L_success, L_failure;
4329
4330 BLOCK_COMMENT("lookup_secondary_supers_table {");
4331
4332 const Register
4333 r_array_index = temp1,
4334 r_array_length = temp2,
4335 r_array_base = temp3,
4336 r_bitmap = temp4;
4337
4338 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;
4339
4340 xorq(result, result); // = 0
4341
4342 movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4343 movq(r_array_index, r_bitmap);
4344
4345 // First check the bitmap to see if super_klass might be present. If
4346 // the bit is zero, we are certain that super_klass is not one of
4347 // the secondary supers.
4348 u1 bit = super_klass_slot;
4349 {
4350 // NB: If the count in a x86 shift instruction is 0, the flags are
4351 // not affected, so we do a testq instead.
4352 int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
4353 if (shift_count != 0) {
4354 salq(r_array_index, shift_count);
4355 } else {
4356 testq(r_array_index, r_array_index);
4357 }
4358 }
4359 // We test the MSB of r_array_index, i.e. its sign bit
4360 jcc(Assembler::positive, L_failure);
4361
4362 // Get the first array index that can contain super_klass into r_array_index.
4363 if (bit != 0) {
4364 population_count(r_array_index, r_array_index, temp2, temp3);
4365 } else {
4366 movl(r_array_index, 1);
4367 }
4368 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
4369
4370 // We will consult the secondary-super array.
4371 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4372
4373 // We're asserting that the first word in an Array<Klass*> is the
4374 // length, and the second word is the first word of the data. If
4375 // that ever changes, r_array_base will have to be adjusted here.
4376 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4377 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4378
4379 cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
4380 jccb(Assembler::equal, L_success);
4381
4382 // Is there another entry to check? Consult the bitmap.
4383 btq(r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
4384 jccb(Assembler::carryClear, L_failure);
4385
4386 // Linear probe. Rotate the bitmap so that the next bit to test is
4387 // in Bit 1.
4388 if (bit != 0) {
4389 rorq(r_bitmap, bit);
4390 }
4391
4392 // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
4393 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
4394 // Kills: r_array_length.
4395 // Returns: result.
4396 call(RuntimeAddress(StubRoutines::lookup_secondary_supers_table_slow_path_stub()));
4397 // Result (0/1) is in rdi
4398 jmpb(L_fallthrough);
4399
4400 bind(L_failure);
4401 incq(result); // 0 => 1
4402
4403 bind(L_success);
4404 // result = 0;
4405
4406 bind(L_fallthrough);
4407 BLOCK_COMMENT("} lookup_secondary_supers_table");
4408
4409 if (VerifySecondarySupers) {
4410 verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
4411 temp1, temp2, temp3);
4412 }
4413 }
4414
4415 // At runtime, return 0 in result if r_super_klass is a superclass of
4416 // r_sub_klass, otherwise return nonzero. Use this version of
4417 // lookup_secondary_supers_table() if you don't know ahead of time
4418 // which superclass will be searched for. Used by interpreter and
4419 // runtime stubs. It is larger and has somewhat greater latency than
4420 // the version above, which takes a constant super_klass_slot.
4421 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
4422 Register r_super_klass,
4423 Register temp1,
4424 Register temp2,
4425 Register temp3,
4426 Register temp4,
4427 Register result) {
4428 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
4429 assert_different_registers(r_sub_klass, r_super_klass, rcx);
4430 RegSet temps = RegSet::of(temp1, temp2, temp3, temp4);
4431
4432 Label L_fallthrough, L_success, L_failure;
4433
4434 BLOCK_COMMENT("lookup_secondary_supers_table {");
4435
4436 RegSetIterator<Register> available_regs = (temps - rcx).begin();
4437
4438 // FIXME. Once we are sure that all paths reaching this point really
4439 // do pass rcx as one of our temps we can get rid of the following
4440 // workaround.
4441 assert(temps.contains(rcx), "fix this code");
4442
4443 // We prefer to have our shift count in rcx. If rcx is one of our
4444 // temps, use it for slot. If not, pick any of our temps.
4445 Register slot;
4446 if (!temps.contains(rcx)) {
4447 slot = *available_regs++;
4448 } else {
4449 slot = rcx;
4450 }
4451
4452 const Register r_array_index = *available_regs++;
4453 const Register r_bitmap = *available_regs++;
4454
4455 // The logic above guarantees this property, but we state it here.
4456 assert_different_registers(r_array_index, r_bitmap, rcx);
4457
4458 movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4459 movq(r_array_index, r_bitmap);
4460
4461 // First check the bitmap to see if super_klass might be present. If
4462 // the bit is zero, we are certain that super_klass is not one of
4463 // the secondary supers.
4464 movb(slot, Address(r_super_klass, Klass::hash_slot_offset()));
4465 xorl(slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1)); // slot ^ 63 === 63 - slot (mod 64)
4466 salq(r_array_index, slot);
4467
4468 testq(r_array_index, r_array_index);
4469 // We test the MSB of r_array_index, i.e. its sign bit
4470 jcc(Assembler::positive, L_failure);
4471
4472 const Register r_array_base = *available_regs++;
4473
4474 // Get the first array index that can contain super_klass into r_array_index.
4475 // Note: Clobbers r_array_base and slot.
4476 population_count(r_array_index, r_array_index, /*temp2*/r_array_base, /*temp3*/slot);
4477
4478 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
4479
4480 // We will consult the secondary-super array.
4481 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4482
4483 // We're asserting that the first word in an Array<Klass*> is the
4484 // length, and the second word is the first word of the data. If
4485 // that ever changes, r_array_base will have to be adjusted here.
4486 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4487 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4488
4489 cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
4490 jccb(Assembler::equal, L_success);
4491
4492 // Restore slot to its true value
4493 movb(slot, Address(r_super_klass, Klass::hash_slot_offset()));
4494
4495 // Linear probe. Rotate the bitmap so that the next bit to test is
4496 // in Bit 1.
4497 rorq(r_bitmap, slot);
4498
4499 // Is there another entry to check? Consult the bitmap.
4500 btq(r_bitmap, 1);
4501 jccb(Assembler::carryClear, L_failure);
4502
4503 // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
4504 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
4505 // Kills: r_array_length.
4506 // Returns: result.
4507 lookup_secondary_supers_table_slow_path(r_super_klass,
4508 r_array_base,
4509 r_array_index,
4510 r_bitmap,
4511 /*temp1*/result,
4512 /*temp2*/slot,
4513 &L_success,
4514 nullptr);
4515
4516 bind(L_failure);
4517 movq(result, 1);
4518 jmpb(L_fallthrough);
4519
4520 bind(L_success);
4521 xorq(result, result); // = 0
4522
4523 bind(L_fallthrough);
4524 BLOCK_COMMENT("} lookup_secondary_supers_table");
4525
4526 if (VerifySecondarySupers) {
4527 verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
4528 temp1, temp2, temp3);
4529 }
4530 }
4531
4532 void MacroAssembler::repne_scanq(Register addr, Register value, Register count, Register limit,
4533 Label* L_success, Label* L_failure) {
4534 Label L_loop, L_fallthrough;
4535 {
4536 int label_nulls = 0;
4537 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4538 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4539 assert(label_nulls <= 1, "at most one null in the batch");
4540 }
4541 bind(L_loop);
4542 cmpq(value, Address(addr, count, Address::times_8));
4543 jcc(Assembler::equal, *L_success);
4544 addl(count, 1);
4545 cmpl(count, limit);
4546 jcc(Assembler::less, L_loop);
4547
4548 if (&L_fallthrough != L_failure) {
4549 jmp(*L_failure);
4550 }
4551 bind(L_fallthrough);
4552 }
4553
4554 // Called by code generated by check_klass_subtype_slow_path
4555 // above. This is called when there is a collision in the hashed
4556 // lookup in the secondary supers array.
4557 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
4558 Register r_array_base,
4559 Register r_array_index,
4560 Register r_bitmap,
4561 Register temp1,
4562 Register temp2,
4563 Label* L_success,
4564 Label* L_failure) {
4565 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, temp2);
4566
4567 const Register
4568 r_array_length = temp1,
4569 r_sub_klass = noreg,
4570 result = noreg;
4571
4572 Label L_fallthrough;
4573 int label_nulls = 0;
4574 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4575 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4576 assert(label_nulls <= 1, "at most one null in the batch");
4577
4578 // Load the array length.
4579 movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4580 // And adjust the array base to point to the data.
4581 // NB! Effectively increments current slot index by 1.
4582 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
4583 addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());
4584
4585 // Linear probe
4586 Label L_huge;
4587
4588 // The bitmap is full to bursting.
4589 // Implicit invariant: BITMAP_FULL implies (length > 0)
4590 cmpl(r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
4591 jcc(Assembler::greater, L_huge);
4592
4593 // NB! Our caller has checked bits 0 and 1 in the bitmap. The
4594 // current slot (at secondary_supers[r_array_index]) has not yet
4595 // been inspected, and r_array_index may be out of bounds if we
4596 // wrapped around the end of the array.
4597
4598 { // This is conventional linear probing, but instead of terminating
4599 // when a null entry is found in the table, we maintain a bitmap
4600 // in which a 0 indicates missing entries.
4601 // The check above guarantees there are 0s in the bitmap, so the loop
4602 // eventually terminates.
4603
4604 xorl(temp2, temp2); // = 0;
4605
4606 Label L_again;
4607 bind(L_again);
4608
4609 // Check for array wraparound.
4610 cmpl(r_array_index, r_array_length);
4611 cmovl(Assembler::greaterEqual, r_array_index, temp2);
4612
4613 cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
4614 jcc(Assembler::equal, *L_success);
4615
4616 // If the next bit in bitmap is zero, we're done.
4617 btq(r_bitmap, 2); // look-ahead check (Bit 2); Bits 0 and 1 are tested by now
4618 jcc(Assembler::carryClear, *L_failure);
4619
4620 rorq(r_bitmap, 1); // Bits 1/2 => 0/1
4621 addl(r_array_index, 1);
4622
4623 jmp(L_again);
4624 }
4625
4626 { // Degenerate case: more than 64 secondary supers.
4627 // FIXME: We could do something smarter here, maybe a vectorized
4628 // comparison or a binary search, but is that worth any added
4629 // complexity?
4630 bind(L_huge);
4631 xorl(r_array_index, r_array_index); // = 0
4632 repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length,
4633 L_success,
4634 (&L_fallthrough != L_failure ? L_failure : nullptr));
4635
4636 bind(L_fallthrough);
4637 }
4638 }
4639
4640 struct VerifyHelperArguments {
4641 Klass* _super;
4642 Klass* _sub;
4643 intptr_t _linear_result;
4644 intptr_t _table_result;
4645 };
4646
4647 static void verify_secondary_supers_table_helper(const char* msg, VerifyHelperArguments* args) {
4648 Klass::on_secondary_supers_verification_failure(args->_super,
4649 args->_sub,
4650 args->_linear_result,
4651 args->_table_result,
4652 msg);
4653 }
4654
4655 // Make sure that the hashed lookup and a linear scan agree.
4656 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
4657 Register r_super_klass,
4658 Register result,
4659 Register temp1,
4660 Register temp2,
4661 Register temp3) {
4662 const Register
4663 r_array_index = temp1,
4664 r_array_length = temp2,
4665 r_array_base = temp3,
4666 r_bitmap = noreg;
4667
4668 BLOCK_COMMENT("verify_secondary_supers_table {");
4669
4670 Label L_success, L_failure, L_check, L_done;
4671
4672 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4673 movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4674 // And adjust the array base to point to the data.
4675 addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());
4676
4677 testl(r_array_length, r_array_length); // array_length == 0?
4678 jcc(Assembler::zero, L_failure);
4679
4680 movl(r_array_index, 0);
4681 repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length, &L_success);
4682 // fall through to L_failure
4683
4684 const Register linear_result = r_array_index; // reuse temp1
4685
4686 bind(L_failure); // not present
4687 movl(linear_result, 1);
4688 jmp(L_check);
4689
4690 bind(L_success); // present
4691 movl(linear_result, 0);
4692
4693 bind(L_check);
4694 cmpl(linear_result, result);
4695 jcc(Assembler::equal, L_done);
4696
4697 { // To avoid calling convention issues, build a record on the stack
4698 // and pass the pointer to that instead.
4699 push(result);
4700 push(linear_result);
4701 push(r_sub_klass);
4702 push(r_super_klass);
4703 movptr(c_rarg1, rsp);
4704 movptr(c_rarg0, (uintptr_t) "mismatch");
4705 call(RuntimeAddress(CAST_FROM_FN_PTR(address, verify_secondary_supers_table_helper)));
4706 should_not_reach_here();
4707 }
4708 bind(L_done);
4709
4710 BLOCK_COMMENT("} verify_secondary_supers_table");
4711 }
4712
4713 #undef LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS
4714
4715 void MacroAssembler::clinit_barrier(Register klass, Label* L_fast_path, Label* L_slow_path) {
4716 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
4717
4718 Label L_fallthrough;
4719 if (L_fast_path == nullptr) {
4720 L_fast_path = &L_fallthrough;
4721 } else if (L_slow_path == nullptr) {
4722 L_slow_path = &L_fallthrough;
4723 }
4724
4725 // Fast path check: class is fully initialized.
4726 // init_state needs acquire, but x86 is TSO, and so we are already good.
4727 cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4728 jcc(Assembler::equal, *L_fast_path);
4729
4730 // Fast path check: current thread is initializer thread
4731 cmpptr(r15_thread, Address(klass, InstanceKlass::init_thread_offset()));
4732 if (L_slow_path == &L_fallthrough) {
4733 jcc(Assembler::equal, *L_fast_path);
4734 bind(*L_slow_path);
4735 } else if (L_fast_path == &L_fallthrough) {
4736 jcc(Assembler::notEqual, *L_slow_path);
4737 bind(*L_fast_path);
4738 } else {
4739 Unimplemented();
4740 }
4741 }
4742
4743 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4744 if (VM_Version::supports_cmov()) {
4745 cmovl(cc, dst, src);
4746 } else {
4747 Label L;
4748 jccb(negate_condition(cc), L);
4749 movl(dst, src);
4750 bind(L);
4751 }
4752 }
4753
4754 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4755 if (VM_Version::supports_cmov()) {
4756 cmovl(cc, dst, src);
4757 } else {
4758 Label L;
4759 jccb(negate_condition(cc), L);
4760 movl(dst, src);
4761 bind(L);
4762 }
4763 }
4764
4765 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4766 if (!VerifyOops) return;
4767
4768 BLOCK_COMMENT("verify_oop {");
4769 push(rscratch1);
4770 push(rax); // save rax
4771 push(reg); // pass register argument
4772
4773 // Pass register number to verify_oop_subroutine
4774 const char* b = nullptr;
4775 {
4776 ResourceMark rm;
4777 stringStream ss;
4778 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4779 b = code_string(ss.as_string());
4780 }
4781 AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
4782 pushptr(buffer.addr(), rscratch1);
4783
4784 // call indirectly to solve generation ordering problem
4785 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4786 call(rax);
4787 // Caller pops the arguments (oop, message) and restores rax, r10
4788 BLOCK_COMMENT("} verify_oop");
4789 }
4790
4791 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
4792 if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4793 // Only pcmpeq has dependency breaking treatment (i.e the execution can begin without
4794 // waiting for the previous result on dst), not vpcmpeqd, so just use vpternlog
4795 vpternlogd(dst, 0xFF, dst, dst, vector_len);
4796 } else if (VM_Version::supports_avx()) {
4797 vpcmpeqd(dst, dst, dst, vector_len);
4798 } else {
4799 pcmpeqd(dst, dst);
4800 }
4801 }
4802
4803 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4804 int extra_slot_offset) {
4805 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4806 int stackElementSize = Interpreter::stackElementSize;
4807 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4808 #ifdef ASSERT
4809 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4810 assert(offset1 - offset == stackElementSize, "correct arithmetic");
4811 #endif
4812 Register scale_reg = noreg;
4813 Address::ScaleFactor scale_factor = Address::no_scale;
4814 if (arg_slot.is_constant()) {
4815 offset += arg_slot.as_constant() * stackElementSize;
4816 } else {
4817 scale_reg = arg_slot.as_register();
4818 scale_factor = Address::times(stackElementSize);
4819 }
4820 offset += wordSize; // return PC is on stack
4821 return Address(rsp, scale_reg, scale_factor, offset);
4822 }
4823
4824 // Handle the receiver type profile update given the "recv" klass.
4825 //
4826 // Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
4827 // If there are no matching or claimable receiver entries in RD, updates
4828 // the polymorphic counter.
4829 //
4830 // This code expected to run by either the interpreter or JIT-ed code, without
4831 // extra synchronization. For safety, receiver cells are claimed atomically, which
4832 // avoids grossly misrepresenting the profiles under concurrent updates. For speed,
4833 // counter updates are not atomic.
4834 //
4835 void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset) {
4836 int base_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(0));
4837 int end_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit()));
4838 int poly_count_offset = in_bytes(CounterData::count_offset());
4839 int receiver_step = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
4840 int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;
4841
4842 // Adjust for MDP offsets. Slots are pointer-sized, so is the global offset.
4843 assert(is_aligned(mdp_offset, BytesPerWord), "sanity");
4844 base_receiver_offset += mdp_offset;
4845 end_receiver_offset += mdp_offset;
4846 poly_count_offset += mdp_offset;
4847
4848 // Scale down to optimize encoding. Slots are pointer-sized.
4849 assert(is_aligned(base_receiver_offset, BytesPerWord), "sanity");
4850 assert(is_aligned(end_receiver_offset, BytesPerWord), "sanity");
4851 assert(is_aligned(poly_count_offset, BytesPerWord), "sanity");
4852 assert(is_aligned(receiver_step, BytesPerWord), "sanity");
4853 assert(is_aligned(receiver_to_count_step, BytesPerWord), "sanity");
4854 base_receiver_offset >>= LogBytesPerWord;
4855 end_receiver_offset >>= LogBytesPerWord;
4856 poly_count_offset >>= LogBytesPerWord;
4857 receiver_step >>= LogBytesPerWord;
4858 receiver_to_count_step >>= LogBytesPerWord;
4859
4860 #ifdef ASSERT
4861 // We are about to walk the MDO slots without asking for offsets.
4862 // Check that our math hits all the right spots.
4863 for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
4864 int real_recv_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
4865 int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
4866 int offset = base_receiver_offset + receiver_step*c;
4867 int count_offset = offset + receiver_to_count_step;
4868 assert((offset << LogBytesPerWord) == real_recv_offset, "receiver slot math");
4869 assert((count_offset << LogBytesPerWord) == real_count_offset, "receiver count math");
4870 }
4871 int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
4872 assert(poly_count_offset << LogBytesPerWord == real_poly_count_offset, "poly counter math");
4873 #endif
4874
4875 // Corner case: no profile table. Increment poly counter and exit.
4876 if (ReceiverTypeData::row_limit() == 0) {
4877 addptr(Address(mdp, poly_count_offset, Address::times_ptr), DataLayout::counter_increment);
4878 return;
4879 }
4880
4881 Register offset = rscratch1;
4882
4883 Label L_loop_search_receiver, L_loop_search_empty;
4884 Label L_restart, L_found_recv, L_found_empty, L_polymorphic, L_count_update;
4885
4886 // The code here recognizes three major cases:
4887 // A. Fastest: receiver found in the table
4888 // B. Fast: no receiver in the table, and the table is full
4889 // C. Slow: no receiver in the table, free slots in the table
4890 //
4891 // The case A performance is most important, as perfectly-behaved code would end up
4892 // there, especially with larger TypeProfileWidth. The case B performance is
4893 // important as well, this is where bulk of code would land for normally megamorphic
4894 // cases. The case C performance is not essential, its job is to deal with installation
4895 // races, we optimize for code density instead. Case C needs to make sure that receiver
4896 // rows are only claimed once. This makes sure we never overwrite a row for another
4897 // receiver and never duplicate the receivers in the list, making profile type-accurate.
4898 //
4899 // It is very tempting to handle these cases in a single loop, and claim the first slot
4900 // without checking the rest of the table. But, profiling code should tolerate free slots
4901 // in the table, as class unloading can clear them. After such cleanup, the receiver
4902 // we need might be _after_ the free slot. Therefore, we need to let at least full scan
4903 // to complete, before trying to install new slots. Splitting the code in several tight
4904 // loops also helpfully optimizes for cases A and B.
4905 //
4906 // This code is effectively:
4907 //
4908 // restart:
4909 // // Fastest: receiver is already installed
4910 // for (i = 0; i < receiver_count(); i++) {
4911 // if (receiver(i) == recv) goto found_recv(i);
4912 // }
4913 //
4914 // // Fast: no receiver, but profile is full
4915 // for (i = 0; i < receiver_count(); i++) {
4916 // if (receiver(i) == null) goto found_null(i);
4917 // }
4918 // goto polymorphic
4919 //
4920 // // Slow: try to install receiver
4921 // found_null(i):
4922 // CAS(&receiver(i), null, recv);
4923 // goto restart
4924 //
4925 // polymorphic:
4926 // count++;
4927 // return
4928 //
4929 // found_recv(i):
4930 // *receiver_count(i)++
4931 //
4932
4933 bind(L_restart);
4934
4935 // Fastest: receiver is already installed
4936 movptr(offset, base_receiver_offset);
4937 bind(L_loop_search_receiver);
4938 cmpptr(recv, Address(mdp, offset, Address::times_ptr));
4939 jccb(Assembler::equal, L_found_recv);
4940 addptr(offset, receiver_step);
4941 cmpptr(offset, end_receiver_offset);
4942 jccb(Assembler::notEqual, L_loop_search_receiver);
4943
4944 // Fast: no receiver, but profile is full
4945 movptr(offset, base_receiver_offset);
4946 bind(L_loop_search_empty);
4947 cmpptr(Address(mdp, offset, Address::times_ptr), NULL_WORD);
4948 jccb(Assembler::equal, L_found_empty);
4949 addptr(offset, receiver_step);
4950 cmpptr(offset, end_receiver_offset);
4951 jccb(Assembler::notEqual, L_loop_search_empty);
4952 jmpb(L_polymorphic);
4953
4954 // Slow: try to install receiver
4955 bind(L_found_empty);
4956
4957 // Atomically swing receiver slot: null -> recv.
4958 //
4959 // The update code uses CAS, which wants RAX register specifically, *and* it needs
4960 // other important registers untouched, as they form the address. Therefore, we need
4961 // to shift any important registers from RAX into some other spare register. If we
4962 // have a spare register, we are forced to save it on stack here.
4963
4964 Register spare_reg = noreg;
4965 Register shifted_mdp = mdp;
4966 Register shifted_recv = recv;
4967 if (recv == rax || mdp == rax) {
4968 spare_reg = (recv != rbx && mdp != rbx) ? rbx :
4969 (recv != rcx && mdp != rcx) ? rcx :
4970 rdx;
4971 assert_different_registers(mdp, recv, offset, spare_reg);
4972
4973 push(spare_reg);
4974 if (recv == rax) {
4975 movptr(spare_reg, recv);
4976 shifted_recv = spare_reg;
4977 } else {
4978 assert(mdp == rax, "Remaining case");
4979 movptr(spare_reg, mdp);
4980 shifted_mdp = spare_reg;
4981 }
4982 } else {
4983 push(rax);
4984 }
4985
4986 // None of the important registers are in RAX after this shuffle.
4987 assert_different_registers(rax, shifted_mdp, shifted_recv, offset);
4988
4989 xorptr(rax, rax);
4990 cmpxchgptr(shifted_recv, Address(shifted_mdp, offset, Address::times_ptr));
4991
4992 // Unshift registers.
4993 if (recv == rax || mdp == rax) {
4994 movptr(rax, spare_reg);
4995 pop(spare_reg);
4996 } else {
4997 pop(rax);
4998 }
4999
5000 // CAS success means the slot now has the receiver we want. CAS failure means
5001 // something had claimed the slot concurrently: it can be the same receiver we want,
5002 // or something else. Since this is a slow path, we can optimize for code density,
5003 // and just restart the search from the beginning.
5004 jmpb(L_restart);
5005
5006 // Counter updates:
5007
5008 // Increment polymorphic counter instead of receiver slot.
5009 bind(L_polymorphic);
5010 movptr(offset, poly_count_offset);
5011 jmpb(L_count_update);
5012
5013 // Found a receiver, convert its slot offset to corresponding count offset.
5014 bind(L_found_recv);
5015 addptr(offset, receiver_to_count_step);
5016
5017 bind(L_count_update);
5018 addptr(Address(mdp, offset, Address::times_ptr), DataLayout::counter_increment);
5019 }
5020
5021 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
5022 if (!VerifyOops) return;
5023
5024 push(rscratch1);
5025 push(rax); // save rax,
5026 // addr may contain rsp so we will have to adjust it based on the push
5027 // we just did (and on 64 bit we do two pushes)
5028 // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
5029 // stores rax into addr which is backwards of what was intended.
5030 if (addr.uses(rsp)) {
5031 lea(rax, addr);
5032 pushptr(Address(rax, 2 * BytesPerWord));
5033 } else {
5034 pushptr(addr);
5035 }
5036
5037 // Pass register number to verify_oop_subroutine
5038 const char* b = nullptr;
5039 {
5040 ResourceMark rm;
5041 stringStream ss;
5042 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
5043 b = code_string(ss.as_string());
5044 }
5045 AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
5046 pushptr(buffer.addr(), rscratch1);
5047
5048 // call indirectly to solve generation ordering problem
5049 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
5050 call(rax);
5051 // Caller pops the arguments (addr, message) and restores rax, r10.
5052 }
5053
5054 void MacroAssembler::verify_tlab() {
5055 #ifdef ASSERT
5056 if (UseTLAB && VerifyOops) {
5057 Label next, ok;
5058 Register t1 = rsi;
5059
5060 push(t1);
5061
5062 movptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
5063 cmpptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_start_offset())));
5064 jcc(Assembler::aboveEqual, next);
5065 STOP("assert(top >= start)");
5066 should_not_reach_here();
5067
5068 bind(next);
5069 movptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_end_offset())));
5070 cmpptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
5071 jcc(Assembler::aboveEqual, ok);
5072 STOP("assert(top <= end)");
5073 should_not_reach_here();
5074
5075 bind(ok);
5076 pop(t1);
5077 }
5078 #endif
5079 }
5080
5081 class ControlWord {
5082 public:
5083 int32_t _value;
5084
5085 int rounding_control() const { return (_value >> 10) & 3 ; }
5086 int precision_control() const { return (_value >> 8) & 3 ; }
5087 bool precision() const { return ((_value >> 5) & 1) != 0; }
5088 bool underflow() const { return ((_value >> 4) & 1) != 0; }
5089 bool overflow() const { return ((_value >> 3) & 1) != 0; }
5090 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
5091 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
5092 bool invalid() const { return ((_value >> 0) & 1) != 0; }
5093
5094 void print() const {
5095 // rounding control
5096 const char* rc;
5097 switch (rounding_control()) {
5098 case 0: rc = "round near"; break;
5099 case 1: rc = "round down"; break;
5100 case 2: rc = "round up "; break;
5101 case 3: rc = "chop "; break;
5102 default:
5103 rc = nullptr; // silence compiler warnings
5104 fatal("Unknown rounding control: %d", rounding_control());
5105 };
5106 // precision control
5107 const char* pc;
5108 switch (precision_control()) {
5109 case 0: pc = "24 bits "; break;
5110 case 1: pc = "reserved"; break;
5111 case 2: pc = "53 bits "; break;
5112 case 3: pc = "64 bits "; break;
5113 default:
5114 pc = nullptr; // silence compiler warnings
5115 fatal("Unknown precision control: %d", precision_control());
5116 };
5117 // flags
5118 char f[9];
5119 f[0] = ' ';
5120 f[1] = ' ';
5121 f[2] = (precision ()) ? 'P' : 'p';
5122 f[3] = (underflow ()) ? 'U' : 'u';
5123 f[4] = (overflow ()) ? 'O' : 'o';
5124 f[5] = (zero_divide ()) ? 'Z' : 'z';
5125 f[6] = (denormalized()) ? 'D' : 'd';
5126 f[7] = (invalid ()) ? 'I' : 'i';
5127 f[8] = '\x0';
5128 // output
5129 printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
5130 }
5131
5132 };
5133
5134 class StatusWord {
5135 public:
5136 int32_t _value;
5137
5138 bool busy() const { return ((_value >> 15) & 1) != 0; }
5139 bool C3() const { return ((_value >> 14) & 1) != 0; }
5140 bool C2() const { return ((_value >> 10) & 1) != 0; }
5141 bool C1() const { return ((_value >> 9) & 1) != 0; }
5142 bool C0() const { return ((_value >> 8) & 1) != 0; }
5143 int top() const { return (_value >> 11) & 7 ; }
5144 bool error_status() const { return ((_value >> 7) & 1) != 0; }
5145 bool stack_fault() const { return ((_value >> 6) & 1) != 0; }
5146 bool precision() const { return ((_value >> 5) & 1) != 0; }
5147 bool underflow() const { return ((_value >> 4) & 1) != 0; }
5148 bool overflow() const { return ((_value >> 3) & 1) != 0; }
5149 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
5150 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
5151 bool invalid() const { return ((_value >> 0) & 1) != 0; }
5152
5153 void print() const {
5154 // condition codes
5155 char c[5];
5156 c[0] = (C3()) ? '3' : '-';
5157 c[1] = (C2()) ? '2' : '-';
5158 c[2] = (C1()) ? '1' : '-';
5159 c[3] = (C0()) ? '0' : '-';
5160 c[4] = '\x0';
5161 // flags
5162 char f[9];
5163 f[0] = (error_status()) ? 'E' : '-';
5164 f[1] = (stack_fault ()) ? 'S' : '-';
5165 f[2] = (precision ()) ? 'P' : '-';
5166 f[3] = (underflow ()) ? 'U' : '-';
5167 f[4] = (overflow ()) ? 'O' : '-';
5168 f[5] = (zero_divide ()) ? 'Z' : '-';
5169 f[6] = (denormalized()) ? 'D' : '-';
5170 f[7] = (invalid ()) ? 'I' : '-';
5171 f[8] = '\x0';
5172 // output
5173 printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, f, c, top());
5174 }
5175
5176 };
5177
5178 class TagWord {
5179 public:
5180 int32_t _value;
5181
5182 int tag_at(int i) const { return (_value >> (i*2)) & 3; }
5183
5184 void print() const {
5185 printf("%04x", _value & 0xFFFF);
5186 }
5187
5188 };
5189
5190 class FPU_Register {
5191 public:
5192 int32_t _m0;
5193 int32_t _m1;
5194 int16_t _ex;
5195
5196 bool is_indefinite() const {
5197 return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
5198 }
5199
5200 void print() const {
5201 char sign = (_ex < 0) ? '-' : '+';
5202 const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : " ";
5203 printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, kind);
5204 };
5205
5206 };
5207
5208 class FPU_State {
5209 public:
5210 enum {
5211 register_size = 10,
5212 number_of_registers = 8,
5213 register_mask = 7
5214 };
5215
5216 ControlWord _control_word;
5217 StatusWord _status_word;
5218 TagWord _tag_word;
5219 int32_t _error_offset;
5220 int32_t _error_selector;
5221 int32_t _data_offset;
5222 int32_t _data_selector;
5223 int8_t _register[register_size * number_of_registers];
5224
5225 int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
5226 FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; }
5227
5228 const char* tag_as_string(int tag) const {
5229 switch (tag) {
5230 case 0: return "valid";
5231 case 1: return "zero";
5232 case 2: return "special";
5233 case 3: return "empty";
5234 }
5235 ShouldNotReachHere();
5236 return nullptr;
5237 }
5238
5239 void print() const {
5240 // print computation registers
5241 { int t = _status_word.top();
5242 for (int i = 0; i < number_of_registers; i++) {
5243 int j = (i - t) & register_mask;
5244 printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
5245 st(j)->print();
5246 printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
5247 }
5248 }
5249 printf("\n");
5250 // print control registers
5251 printf("ctrl = "); _control_word.print(); printf("\n");
5252 printf("stat = "); _status_word .print(); printf("\n");
5253 printf("tags = "); _tag_word .print(); printf("\n");
5254 }
5255
5256 };
5257
5258 class Flag_Register {
5259 public:
5260 int32_t _value;
5261
5262 bool overflow() const { return ((_value >> 11) & 1) != 0; }
5263 bool direction() const { return ((_value >> 10) & 1) != 0; }
5264 bool sign() const { return ((_value >> 7) & 1) != 0; }
5265 bool zero() const { return ((_value >> 6) & 1) != 0; }
5266 bool auxiliary_carry() const { return ((_value >> 4) & 1) != 0; }
5267 bool parity() const { return ((_value >> 2) & 1) != 0; }
5268 bool carry() const { return ((_value >> 0) & 1) != 0; }
5269
5270 void print() const {
5271 // flags
5272 char f[8];
5273 f[0] = (overflow ()) ? 'O' : '-';
5274 f[1] = (direction ()) ? 'D' : '-';
5275 f[2] = (sign ()) ? 'S' : '-';
5276 f[3] = (zero ()) ? 'Z' : '-';
5277 f[4] = (auxiliary_carry()) ? 'A' : '-';
5278 f[5] = (parity ()) ? 'P' : '-';
5279 f[6] = (carry ()) ? 'C' : '-';
5280 f[7] = '\x0';
5281 // output
5282 printf("%08x flags = %s", _value, f);
5283 }
5284
5285 };
5286
5287 class IU_Register {
5288 public:
5289 int32_t _value;
5290
5291 void print() const {
5292 printf("%08x %11d", _value, _value);
5293 }
5294
5295 };
5296
5297 class IU_State {
5298 public:
5299 Flag_Register _eflags;
5300 IU_Register _rdi;
5301 IU_Register _rsi;
5302 IU_Register _rbp;
5303 IU_Register _rsp;
5304 IU_Register _rbx;
5305 IU_Register _rdx;
5306 IU_Register _rcx;
5307 IU_Register _rax;
5308
5309 void print() const {
5310 // computation registers
5311 printf("rax, = "); _rax.print(); printf("\n");
5312 printf("rbx, = "); _rbx.print(); printf("\n");
5313 printf("rcx = "); _rcx.print(); printf("\n");
5314 printf("rdx = "); _rdx.print(); printf("\n");
5315 printf("rdi = "); _rdi.print(); printf("\n");
5316 printf("rsi = "); _rsi.print(); printf("\n");
5317 printf("rbp, = "); _rbp.print(); printf("\n");
5318 printf("rsp = "); _rsp.print(); printf("\n");
5319 printf("\n");
5320 // control registers
5321 printf("flgs = "); _eflags.print(); printf("\n");
5322 }
5323 };
5324
5325
5326 class CPU_State {
5327 public:
5328 FPU_State _fpu_state;
5329 IU_State _iu_state;
5330
5331 void print() const {
5332 printf("--------------------------------------------------\n");
5333 _iu_state .print();
5334 printf("\n");
5335 _fpu_state.print();
5336 printf("--------------------------------------------------\n");
5337 }
5338
5339 };
5340
5341
5342 static void _print_CPU_state(CPU_State* state) {
5343 state->print();
5344 };
5345
5346
5347 void MacroAssembler::print_CPU_state() {
5348 push_CPU_state();
5349 push(rsp); // pass CPU state
5350 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5351 addptr(rsp, wordSize); // discard argument
5352 pop_CPU_state();
5353 }
5354
5355 void MacroAssembler::restore_cpu_control_state_after_jni(Register rscratch) {
5356 // Either restore the MXCSR register after returning from the JNI Call
5357 // or verify that it wasn't changed (with -Xcheck:jni flag).
5358 if (VM_Version::supports_sse()) {
5359 if (RestoreMXCSROnJNICalls) {
5360 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), rscratch);
5361 } else if (CheckJNICalls) {
5362 call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5363 }
5364 }
5365 // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5366 vzeroupper();
5367 }
5368
5369 // ((OopHandle)result).resolve();
5370 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5371 assert_different_registers(result, tmp);
5372
5373 // Only 64 bit platforms support GCs that require a tmp register
5374 // Only IN_HEAP loads require a thread_tmp register
5375 // OopHandle::resolve is an indirection like jobject.
5376 access_load_at(T_OBJECT, IN_NATIVE,
5377 result, Address(result, 0), tmp);
5378 }
5379
5380 // ((WeakHandle)result).resolve();
5381 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
5382 assert_different_registers(rresult, rtmp);
5383 Label resolved;
5384
5385 // A null weak handle resolves to null.
5386 cmpptr(rresult, 0);
5387 jcc(Assembler::equal, resolved);
5388
5389 // Only 64 bit platforms support GCs that require a tmp register
5390 // Only IN_HEAP loads require a thread_tmp register
5391 // WeakHandle::resolve is an indirection like jweak.
5392 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
5393 rresult, Address(rresult, 0), rtmp);
5394 bind(resolved);
5395 }
5396
5397 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
5398 // get mirror
5399 const int mirror_offset = in_bytes(Klass::java_mirror_offset());
5400 load_method_holder(mirror, method);
5401 movptr(mirror, Address(mirror, mirror_offset));
5402 resolve_oop_handle(mirror, tmp);
5403 }
5404
5405 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
5406 load_method_holder(rresult, rmethod);
5407 movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
5408 }
5409
5410 void MacroAssembler::load_method_holder(Register holder, Register method) {
5411 movptr(holder, Address(method, Method::const_offset())); // ConstMethod*
5412 movptr(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool*
5413 movptr(holder, Address(holder, ConstantPool::pool_holder_offset())); // InstanceKlass*
5414 }
5415
5416 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
5417 assert(UseCompactObjectHeaders, "expect compact object headers");
5418 movq(dst, Address(src, oopDesc::mark_offset_in_bytes()));
5419 shrq(dst, markWord::klass_shift);
5420 }
5421
5422 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
5423 assert_different_registers(src, tmp);
5424 assert_different_registers(dst, tmp);
5425
5426 if (UseCompactObjectHeaders) {
5427 load_narrow_klass_compact(dst, src);
5428 decode_klass_not_null(dst, tmp);
5429 } else {
5430 movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5431 decode_klass_not_null(dst, tmp);
5432 }
5433 }
5434
5435 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
5436 assert(!UseCompactObjectHeaders, "not with compact headers");
5437 assert_different_registers(src, tmp);
5438 assert_different_registers(dst, tmp);
5439 encode_klass_not_null(src, tmp);
5440 movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5441 }
5442
5443 void MacroAssembler::cmp_klass(Register klass, Register obj, Register tmp) {
5444 if (UseCompactObjectHeaders) {
5445 assert(tmp != noreg, "need tmp");
5446 assert_different_registers(klass, obj, tmp);
5447 load_narrow_klass_compact(tmp, obj);
5448 cmpl(klass, tmp);
5449 } else {
5450 cmpl(klass, Address(obj, oopDesc::klass_offset_in_bytes()));
5451 }
5452 }
5453
5454 void MacroAssembler::cmp_klasses_from_objects(Register obj1, Register obj2, Register tmp1, Register tmp2) {
5455 if (UseCompactObjectHeaders) {
5456 assert(tmp2 != noreg, "need tmp2");
5457 assert_different_registers(obj1, obj2, tmp1, tmp2);
5458 load_narrow_klass_compact(tmp1, obj1);
5459 load_narrow_klass_compact(tmp2, obj2);
5460 cmpl(tmp1, tmp2);
5461 } else {
5462 movl(tmp1, Address(obj1, oopDesc::klass_offset_in_bytes()));
5463 cmpl(tmp1, Address(obj2, oopDesc::klass_offset_in_bytes()));
5464 }
5465 }
5466
5467 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5468 Register tmp1) {
5469 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5470 decorators = AccessInternal::decorator_fixup(decorators, type);
5471 bool as_raw = (decorators & AS_RAW) != 0;
5472 if (as_raw) {
5473 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1);
5474 } else {
5475 bs->load_at(this, decorators, type, dst, src, tmp1);
5476 }
5477 }
5478
5479 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
5480 Register tmp1, Register tmp2, Register tmp3) {
5481 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5482 decorators = AccessInternal::decorator_fixup(decorators, type);
5483 bool as_raw = (decorators & AS_RAW) != 0;
5484 if (as_raw) {
5485 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
5486 } else {
5487 bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
5488 }
5489 }
5490
5491 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, DecoratorSet decorators) {
5492 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1);
5493 }
5494
5495 // Doesn't do verification, generates fixed size code
5496 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, DecoratorSet decorators) {
5497 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1);
5498 }
5499
5500 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
5501 Register tmp2, Register tmp3, DecoratorSet decorators) {
5502 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
5503 }
5504
5505 // Used for storing nulls.
5506 void MacroAssembler::store_heap_oop_null(Address dst) {
5507 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
5508 }
5509
5510 void MacroAssembler::store_klass_gap(Register dst, Register src) {
5511 assert(!UseCompactObjectHeaders, "Don't use with compact headers");
5512 // Store to klass gap in destination
5513 movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5514 }
5515
5516 #ifdef ASSERT
5517 void MacroAssembler::verify_heapbase(const char* msg) {
5518 assert (UseCompressedOops, "should be compressed");
5519 assert (Universe::heap() != nullptr, "java heap should be initialized");
5520 if (CheckCompressedOops) {
5521 Label ok;
5522 ExternalAddress src2(CompressedOops::base_addr());
5523 const bool is_src2_reachable = reachable(src2);
5524 if (!is_src2_reachable) {
5525 push(rscratch1); // cmpptr trashes rscratch1
5526 }
5527 cmpptr(r12_heapbase, src2, rscratch1);
5528 jcc(Assembler::equal, ok);
5529 STOP(msg);
5530 bind(ok);
5531 if (!is_src2_reachable) {
5532 pop(rscratch1);
5533 }
5534 }
5535 }
5536 #endif
5537
5538 // Algorithm must match oop.inline.hpp encode_heap_oop.
5539 void MacroAssembler::encode_heap_oop(Register r) {
5540 #ifdef ASSERT
5541 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5542 #endif
5543 verify_oop_msg(r, "broken oop in encode_heap_oop");
5544 if (CompressedOops::base() == nullptr) {
5545 if (CompressedOops::shift() != 0) {
5546 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5547 shrq(r, LogMinObjAlignmentInBytes);
5548 }
5549 return;
5550 }
5551 testq(r, r);
5552 cmovq(Assembler::equal, r, r12_heapbase);
5553 subq(r, r12_heapbase);
5554 shrq(r, LogMinObjAlignmentInBytes);
5555 }
5556
5557 void MacroAssembler::encode_heap_oop_not_null(Register r) {
5558 #ifdef ASSERT
5559 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5560 if (CheckCompressedOops) {
5561 Label ok;
5562 testq(r, r);
5563 jcc(Assembler::notEqual, ok);
5564 STOP("null oop passed to encode_heap_oop_not_null");
5565 bind(ok);
5566 }
5567 #endif
5568 verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
5569 if (CompressedOops::base() != nullptr) {
5570 subq(r, r12_heapbase);
5571 }
5572 if (CompressedOops::shift() != 0) {
5573 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5574 shrq(r, LogMinObjAlignmentInBytes);
5575 }
5576 }
5577
5578 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5579 #ifdef ASSERT
5580 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5581 if (CheckCompressedOops) {
5582 Label ok;
5583 testq(src, src);
5584 jcc(Assembler::notEqual, ok);
5585 STOP("null oop passed to encode_heap_oop_not_null2");
5586 bind(ok);
5587 }
5588 #endif
5589 verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
5590 if (dst != src) {
5591 movq(dst, src);
5592 }
5593 if (CompressedOops::base() != nullptr) {
5594 subq(dst, r12_heapbase);
5595 }
5596 if (CompressedOops::shift() != 0) {
5597 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5598 shrq(dst, LogMinObjAlignmentInBytes);
5599 }
5600 }
5601
5602 void MacroAssembler::decode_heap_oop(Register r) {
5603 #ifdef ASSERT
5604 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5605 #endif
5606 if (CompressedOops::base() == nullptr) {
5607 if (CompressedOops::shift() != 0) {
5608 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5609 shlq(r, LogMinObjAlignmentInBytes);
5610 }
5611 } else {
5612 Label done;
5613 shlq(r, LogMinObjAlignmentInBytes);
5614 jccb(Assembler::equal, done);
5615 addq(r, r12_heapbase);
5616 bind(done);
5617 }
5618 verify_oop_msg(r, "broken oop in decode_heap_oop");
5619 }
5620
5621 void MacroAssembler::decode_heap_oop_not_null(Register r) {
5622 // Note: it will change flags
5623 assert (UseCompressedOops, "should only be used for compressed headers");
5624 assert (Universe::heap() != nullptr, "java heap should be initialized");
5625 // Cannot assert, unverified entry point counts instructions (see .ad file)
5626 // vtableStubs also counts instructions in pd_code_size_limit.
5627 // Also do not verify_oop as this is called by verify_oop.
5628 if (CompressedOops::shift() != 0) {
5629 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5630 shlq(r, LogMinObjAlignmentInBytes);
5631 if (CompressedOops::base() != nullptr) {
5632 addq(r, r12_heapbase);
5633 }
5634 } else {
5635 assert (CompressedOops::base() == nullptr, "sanity");
5636 }
5637 }
5638
5639 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5640 // Note: it will change flags
5641 assert (UseCompressedOops, "should only be used for compressed headers");
5642 assert (Universe::heap() != nullptr, "java heap should be initialized");
5643 // Cannot assert, unverified entry point counts instructions (see .ad file)
5644 // vtableStubs also counts instructions in pd_code_size_limit.
5645 // Also do not verify_oop as this is called by verify_oop.
5646 if (CompressedOops::shift() != 0) {
5647 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5648 if (LogMinObjAlignmentInBytes == Address::times_8) {
5649 leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5650 } else {
5651 if (dst != src) {
5652 movq(dst, src);
5653 }
5654 shlq(dst, LogMinObjAlignmentInBytes);
5655 if (CompressedOops::base() != nullptr) {
5656 addq(dst, r12_heapbase);
5657 }
5658 }
5659 } else {
5660 assert (CompressedOops::base() == nullptr, "sanity");
5661 if (dst != src) {
5662 movq(dst, src);
5663 }
5664 }
5665 }
5666
5667 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
5668 BLOCK_COMMENT("encode_klass_not_null {");
5669 assert_different_registers(r, tmp);
5670 if (CompressedKlassPointers::base() != nullptr) {
5671 if (AOTCodeCache::is_on_for_dump()) {
5672 movptr(tmp, ExternalAddress(CompressedKlassPointers::base_addr()));
5673 } else {
5674 movptr(tmp, (intptr_t)CompressedKlassPointers::base());
5675 }
5676 subq(r, tmp);
5677 }
5678 if (CompressedKlassPointers::shift() != 0) {
5679 shrq(r, CompressedKlassPointers::shift());
5680 }
5681 BLOCK_COMMENT("} encode_klass_not_null");
5682 }
5683
5684 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
5685 BLOCK_COMMENT("encode_and_move_klass_not_null {");
5686 assert_different_registers(src, dst);
5687 if (CompressedKlassPointers::base() != nullptr) {
5688 if (AOTCodeCache::is_on_for_dump()) {
5689 movptr(dst, ExternalAddress(CompressedKlassPointers::base_addr()));
5690 negq(dst);
5691 } else {
5692 movptr(dst, -(intptr_t)CompressedKlassPointers::base());
5693 }
5694 addq(dst, src);
5695 } else {
5696 movptr(dst, src);
5697 }
5698 if (CompressedKlassPointers::shift() != 0) {
5699 shrq(dst, CompressedKlassPointers::shift());
5700 }
5701 BLOCK_COMMENT("} encode_and_move_klass_not_null");
5702 }
5703
5704 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
5705 BLOCK_COMMENT("decode_klass_not_null {");
5706 assert_different_registers(r, tmp);
5707 // Note: it will change flags
5708 // Cannot assert, unverified entry point counts instructions (see .ad file)
5709 // vtableStubs also counts instructions in pd_code_size_limit.
5710 // Also do not verify_oop as this is called by verify_oop.
5711 if (CompressedKlassPointers::shift() != 0) {
5712 shlq(r, CompressedKlassPointers::shift());
5713 }
5714 if (CompressedKlassPointers::base() != nullptr) {
5715 if (AOTCodeCache::is_on_for_dump()) {
5716 movptr(tmp, ExternalAddress(CompressedKlassPointers::base_addr()));
5717 } else {
5718 movptr(tmp, (intptr_t)CompressedKlassPointers::base());
5719 }
5720 addq(r, tmp);
5721 }
5722 BLOCK_COMMENT("} decode_klass_not_null");
5723 }
5724
5725 void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
5726 BLOCK_COMMENT("decode_and_move_klass_not_null {");
5727 assert_different_registers(src, dst);
5728 // Note: it will change flags
5729 // Cannot assert, unverified entry point counts instructions (see .ad file)
5730 // vtableStubs also counts instructions in pd_code_size_limit.
5731 // Also do not verify_oop as this is called by verify_oop.
5732
5733 if (CompressedKlassPointers::base() == nullptr &&
5734 CompressedKlassPointers::shift() == 0) {
5735 // The best case scenario is that there is no base or shift. Then it is already
5736 // a pointer that needs nothing but a register rename.
5737 movl(dst, src);
5738 } else {
5739 if (CompressedKlassPointers::shift() <= Address::times_8) {
5740 if (CompressedKlassPointers::base() != nullptr) {
5741 if (AOTCodeCache::is_on_for_dump()) {
5742 movptr(dst, ExternalAddress(CompressedKlassPointers::base_addr()));
5743 } else {
5744 movptr(dst, (intptr_t)CompressedKlassPointers::base());
5745 }
5746 } else {
5747 xorq(dst, dst);
5748 }
5749 if (CompressedKlassPointers::shift() != 0) {
5750 assert(CompressedKlassPointers::shift() == Address::times_8, "klass not aligned on 64bits?");
5751 leaq(dst, Address(dst, src, Address::times_8, 0));
5752 } else {
5753 addq(dst, src);
5754 }
5755 } else {
5756 if (CompressedKlassPointers::base() != nullptr) {
5757 if (AOTCodeCache::is_on_for_dump()) {
5758 movptr(dst, ExternalAddress(CompressedKlassPointers::base_addr()));
5759 shrq(dst, CompressedKlassPointers::shift());
5760 } else {
5761 const intptr_t base_right_shifted =
5762 (intptr_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
5763 movptr(dst, base_right_shifted);
5764 }
5765 } else {
5766 xorq(dst, dst);
5767 }
5768 addq(dst, src);
5769 shlq(dst, CompressedKlassPointers::shift());
5770 }
5771 }
5772 BLOCK_COMMENT("} decode_and_move_klass_not_null");
5773 }
5774
5775 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5776 assert (UseCompressedOops, "should only be used for compressed headers");
5777 assert (Universe::heap() != nullptr, "java heap should be initialized");
5778 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5779 int oop_index = oop_recorder()->find_index(obj);
5780 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5781 mov_narrow_oop(dst, oop_index, rspec);
5782 }
5783
5784 void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5785 assert (UseCompressedOops, "should only be used for compressed headers");
5786 assert (Universe::heap() != nullptr, "java heap should be initialized");
5787 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5788 int oop_index = oop_recorder()->find_index(obj);
5789 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5790 mov_narrow_oop(dst, oop_index, rspec);
5791 }
5792
5793 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5794 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5795 int klass_index = oop_recorder()->find_index(k);
5796 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5797 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5798 }
5799
5800 void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5801 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5802 int klass_index = oop_recorder()->find_index(k);
5803 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5804 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5805 }
5806
5807 void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5808 assert (UseCompressedOops, "should only be used for compressed headers");
5809 assert (Universe::heap() != nullptr, "java heap should be initialized");
5810 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5811 int oop_index = oop_recorder()->find_index(obj);
5812 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5813 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5814 }
5815
5816 void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5817 assert (UseCompressedOops, "should only be used for compressed headers");
5818 assert (Universe::heap() != nullptr, "java heap should be initialized");
5819 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5820 int oop_index = oop_recorder()->find_index(obj);
5821 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5822 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5823 }
5824
5825 void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5826 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5827 int klass_index = oop_recorder()->find_index(k);
5828 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5829 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5830 }
5831
5832 void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5833 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5834 int klass_index = oop_recorder()->find_index(k);
5835 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5836 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5837 }
5838
5839 void MacroAssembler::reinit_heapbase() {
5840 if (UseCompressedOops) {
5841 if (Universe::heap() != nullptr && !AOTCodeCache::is_on_for_dump()) {
5842 if (CompressedOops::base() == nullptr) {
5843 MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5844 } else {
5845 mov64(r12_heapbase, (int64_t)CompressedOops::base());
5846 }
5847 } else {
5848 movptr(r12_heapbase, ExternalAddress(CompressedOops::base_addr()));
5849 }
5850 }
5851 }
5852
5853 #if COMPILER2_OR_JVMCI
5854
5855 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
5856 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5857 // cnt - number of qwords (8-byte words).
5858 // base - start address, qword aligned.
5859 Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5860 bool use64byteVector = (MaxVectorSize == 64) && (CopyAVX3Threshold == 0);
5861 if (use64byteVector) {
5862 vpxor(xtmp, xtmp, xtmp, AVX_512bit);
5863 } else if (MaxVectorSize >= 32) {
5864 vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5865 } else {
5866 pxor(xtmp, xtmp);
5867 }
5868 jmp(L_zero_64_bytes);
5869
5870 BIND(L_loop);
5871 if (MaxVectorSize >= 32) {
5872 fill64(base, 0, xtmp, use64byteVector);
5873 } else {
5874 movdqu(Address(base, 0), xtmp);
5875 movdqu(Address(base, 16), xtmp);
5876 movdqu(Address(base, 32), xtmp);
5877 movdqu(Address(base, 48), xtmp);
5878 }
5879 addptr(base, 64);
5880
5881 BIND(L_zero_64_bytes);
5882 subptr(cnt, 8);
5883 jccb(Assembler::greaterEqual, L_loop);
5884
5885 // Copy trailing 64 bytes
5886 if (use64byteVector) {
5887 addptr(cnt, 8);
5888 jccb(Assembler::equal, L_end);
5889 fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
5890 jmp(L_end);
5891 } else {
5892 addptr(cnt, 4);
5893 jccb(Assembler::less, L_tail);
5894 if (MaxVectorSize >= 32) {
5895 vmovdqu(Address(base, 0), xtmp);
5896 } else {
5897 movdqu(Address(base, 0), xtmp);
5898 movdqu(Address(base, 16), xtmp);
5899 }
5900 }
5901 addptr(base, 32);
5902 subptr(cnt, 4);
5903
5904 BIND(L_tail);
5905 addptr(cnt, 4);
5906 jccb(Assembler::lessEqual, L_end);
5907 if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
5908 fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
5909 } else {
5910 decrement(cnt);
5911
5912 BIND(L_sloop);
5913 movq(Address(base, 0), xtmp);
5914 addptr(base, 8);
5915 decrement(cnt);
5916 jccb(Assembler::greaterEqual, L_sloop);
5917 }
5918 BIND(L_end);
5919 }
5920
5921 // Clearing constant sized memory using YMM/ZMM registers.
5922 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5923 assert(UseAVX > 2 && VM_Version::supports_avx512vl(), "");
5924 bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
5925
5926 int vector64_count = (cnt & (~0x7)) >> 3;
5927 cnt = cnt & 0x7;
5928 const int fill64_per_loop = 4;
5929 const int max_unrolled_fill64 = 8;
5930
5931 // 64 byte initialization loop.
5932 vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
5933 int start64 = 0;
5934 if (vector64_count > max_unrolled_fill64) {
5935 Label LOOP;
5936 Register index = rtmp;
5937
5938 start64 = vector64_count - (vector64_count % fill64_per_loop);
5939
5940 movl(index, 0);
5941 BIND(LOOP);
5942 for (int i = 0; i < fill64_per_loop; i++) {
5943 fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
5944 }
5945 addl(index, fill64_per_loop * 64);
5946 cmpl(index, start64 * 64);
5947 jccb(Assembler::less, LOOP);
5948 }
5949 for (int i = start64; i < vector64_count; i++) {
5950 fill64(base, i * 64, xtmp, use64byteVector);
5951 }
5952
5953 // Clear remaining 64 byte tail.
5954 int disp = vector64_count * 64;
5955 if (cnt) {
5956 switch (cnt) {
5957 case 1:
5958 movq(Address(base, disp), xtmp);
5959 break;
5960 case 2:
5961 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit);
5962 break;
5963 case 3:
5964 movl(rtmp, 0x7);
5965 kmovwl(mask, rtmp);
5966 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit);
5967 break;
5968 case 4:
5969 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5970 break;
5971 case 5:
5972 if (use64byteVector) {
5973 movl(rtmp, 0x1F);
5974 kmovwl(mask, rtmp);
5975 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5976 } else {
5977 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5978 movq(Address(base, disp + 32), xtmp);
5979 }
5980 break;
5981 case 6:
5982 if (use64byteVector) {
5983 movl(rtmp, 0x3F);
5984 kmovwl(mask, rtmp);
5985 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5986 } else {
5987 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5988 evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit);
5989 }
5990 break;
5991 case 7:
5992 if (use64byteVector) {
5993 movl(rtmp, 0x7F);
5994 kmovwl(mask, rtmp);
5995 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5996 } else {
5997 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5998 movl(rtmp, 0x7);
5999 kmovwl(mask, rtmp);
6000 evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit);
6001 }
6002 break;
6003 default:
6004 fatal("Unexpected length : %d\n",cnt);
6005 break;
6006 }
6007 }
6008 }
6009
6010 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
6011 bool is_large, KRegister mask) {
6012 // cnt - number of qwords (8-byte words).
6013 // base - start address, qword aligned.
6014 // is_large - if optimizers know cnt is larger than InitArrayShortSize
6015 assert(base==rdi, "base register must be edi for rep stos");
6016 assert(tmp==rax, "tmp register must be eax for rep stos");
6017 assert(cnt==rcx, "cnt register must be ecx for rep stos");
6018 assert(InitArrayShortSize % BytesPerLong == 0,
6019 "InitArrayShortSize should be the multiple of BytesPerLong");
6020
6021 Label DONE;
6022 if (!is_large || !UseXMMForObjInit) {
6023 xorptr(tmp, tmp);
6024 }
6025
6026 if (!is_large) {
6027 Label LOOP, LONG;
6028 cmpptr(cnt, InitArrayShortSize/BytesPerLong);
6029 jccb(Assembler::greater, LONG);
6030
6031 decrement(cnt);
6032 jccb(Assembler::negative, DONE); // Zero length
6033
6034 // Use individual pointer-sized stores for small counts:
6035 BIND(LOOP);
6036 movptr(Address(base, cnt, Address::times_ptr), tmp);
6037 decrement(cnt);
6038 jccb(Assembler::greaterEqual, LOOP);
6039 jmpb(DONE);
6040
6041 BIND(LONG);
6042 }
6043
6044 // Use longer rep-prefixed ops for non-small counts:
6045 if (UseFastStosb) {
6046 shlptr(cnt, 3); // convert to number of bytes
6047 rep_stosb();
6048 } else if (UseXMMForObjInit) {
6049 xmm_clear_mem(base, cnt, tmp, xtmp, mask);
6050 } else {
6051 rep_stos();
6052 }
6053
6054 BIND(DONE);
6055 }
6056
6057 #endif //COMPILER2_OR_JVMCI
6058
6059
6060 void MacroAssembler::generate_fill(BasicType t, bool aligned,
6061 Register to, Register value, Register count,
6062 Register rtmp, XMMRegister xtmp) {
6063 ShortBranchVerifier sbv(this);
6064 assert_different_registers(to, value, count, rtmp);
6065 Label L_exit;
6066 Label L_fill_2_bytes, L_fill_4_bytes;
6067
6068 #if defined(COMPILER2)
6069 if(MaxVectorSize >=32 &&
6070 VM_Version::supports_avx512vlbw() &&
6071 VM_Version::supports_bmi2()) {
6072 generate_fill_avx3(t, to, value, count, rtmp, xtmp);
6073 return;
6074 }
6075 #endif
6076
6077 int shift = -1;
6078 switch (t) {
6079 case T_BYTE:
6080 shift = 2;
6081 break;
6082 case T_SHORT:
6083 shift = 1;
6084 break;
6085 case T_INT:
6086 shift = 0;
6087 break;
6088 default: ShouldNotReachHere();
6089 }
6090
6091 if (t == T_BYTE) {
6092 andl(value, 0xff);
6093 movl(rtmp, value);
6094 shll(rtmp, 8);
6095 orl(value, rtmp);
6096 }
6097 if (t == T_SHORT) {
6098 andl(value, 0xffff);
6099 }
6100 if (t == T_BYTE || t == T_SHORT) {
6101 movl(rtmp, value);
6102 shll(rtmp, 16);
6103 orl(value, rtmp);
6104 }
6105
6106 cmpptr(count, 8 << shift); // Short arrays (< 32 bytes) fill by element
6107 jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
6108 if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
6109 Label L_skip_align2;
6110 // align source address at 4 bytes address boundary
6111 if (t == T_BYTE) {
6112 Label L_skip_align1;
6113 // One byte misalignment happens only for byte arrays
6114 testptr(to, 1);
6115 jccb(Assembler::zero, L_skip_align1);
6116 movb(Address(to, 0), value);
6117 increment(to);
6118 decrement(count);
6119 BIND(L_skip_align1);
6120 }
6121 // Two bytes misalignment happens only for byte and short (char) arrays
6122 testptr(to, 2);
6123 jccb(Assembler::zero, L_skip_align2);
6124 movw(Address(to, 0), value);
6125 addptr(to, 2);
6126 subptr(count, 1<<(shift-1));
6127 BIND(L_skip_align2);
6128 }
6129 {
6130 Label L_fill_32_bytes;
6131 if (!UseUnalignedLoadStores) {
6132 // align to 8 bytes, we know we are 4 byte aligned to start
6133 testptr(to, 4);
6134 jccb(Assembler::zero, L_fill_32_bytes);
6135 movl(Address(to, 0), value);
6136 addptr(to, 4);
6137 subptr(count, 1<<shift);
6138 }
6139 BIND(L_fill_32_bytes);
6140 {
6141 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
6142 movdl(xtmp, value);
6143 if (UseAVX >= 2 && UseUnalignedLoadStores) {
6144 Label L_check_fill_32_bytes;
6145 if (UseAVX > 2) {
6146 // Fill 64-byte chunks
6147 Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
6148
6149 // If number of bytes to fill < CopyAVX3Threshold, perform fill using AVX2
6150 cmpptr(count, CopyAVX3Threshold);
6151 jccb(Assembler::below, L_check_fill_64_bytes_avx2);
6152
6153 vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
6154
6155 subptr(count, 16 << shift);
6156 jcc(Assembler::less, L_check_fill_32_bytes);
6157 align(16);
6158
6159 BIND(L_fill_64_bytes_loop_avx3);
6160 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
6161 addptr(to, 64);
6162 subptr(count, 16 << shift);
6163 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
6164 jmpb(L_check_fill_32_bytes);
6165
6166 BIND(L_check_fill_64_bytes_avx2);
6167 }
6168 // Fill 64-byte chunks
6169 vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
6170
6171 subptr(count, 16 << shift);
6172 jcc(Assembler::less, L_check_fill_32_bytes);
6173
6174 // align data for 64-byte chunks
6175 Label L_fill_64_bytes_loop, L_align_64_bytes_loop;
6176 if (EnableX86ECoreOpts) {
6177 // align 'big' arrays to cache lines to minimize split_stores
6178 cmpptr(count, 96 << shift);
6179 jcc(Assembler::below, L_fill_64_bytes_loop);
6180
6181 // Find the bytes needed for alignment
6182 movptr(rtmp, to);
6183 andptr(rtmp, 0x1c);
6184 jcc(Assembler::zero, L_fill_64_bytes_loop);
6185 negptr(rtmp); // number of bytes to fill 32-rtmp. it filled by 2 mov by 32
6186 addptr(rtmp, 32);
6187 shrptr(rtmp, 2 - shift);// get number of elements from bytes
6188 subptr(count, rtmp); // adjust count by number of elements
6189
6190 align(16);
6191 BIND(L_align_64_bytes_loop);
6192 movdl(Address(to, 0), xtmp);
6193 addptr(to, 4);
6194 subptr(rtmp, 1 << shift);
6195 jcc(Assembler::greater, L_align_64_bytes_loop);
6196 }
6197
6198 align(16);
6199 BIND(L_fill_64_bytes_loop);
6200 vmovdqu(Address(to, 0), xtmp);
6201 vmovdqu(Address(to, 32), xtmp);
6202 addptr(to, 64);
6203 subptr(count, 16 << shift);
6204 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
6205
6206 align(16);
6207 BIND(L_check_fill_32_bytes);
6208 addptr(count, 8 << shift);
6209 jccb(Assembler::less, L_check_fill_8_bytes);
6210 vmovdqu(Address(to, 0), xtmp);
6211 addptr(to, 32);
6212 subptr(count, 8 << shift);
6213
6214 BIND(L_check_fill_8_bytes);
6215 // clean upper bits of YMM registers
6216 movdl(xtmp, value);
6217 pshufd(xtmp, xtmp, 0);
6218 } else {
6219 // Fill 32-byte chunks
6220 pshufd(xtmp, xtmp, 0);
6221
6222 subptr(count, 8 << shift);
6223 jcc(Assembler::less, L_check_fill_8_bytes);
6224 align(16);
6225
6226 BIND(L_fill_32_bytes_loop);
6227
6228 if (UseUnalignedLoadStores) {
6229 movdqu(Address(to, 0), xtmp);
6230 movdqu(Address(to, 16), xtmp);
6231 } else {
6232 movq(Address(to, 0), xtmp);
6233 movq(Address(to, 8), xtmp);
6234 movq(Address(to, 16), xtmp);
6235 movq(Address(to, 24), xtmp);
6236 }
6237
6238 addptr(to, 32);
6239 subptr(count, 8 << shift);
6240 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
6241
6242 BIND(L_check_fill_8_bytes);
6243 }
6244 addptr(count, 8 << shift);
6245 jccb(Assembler::zero, L_exit);
6246 jmpb(L_fill_8_bytes);
6247
6248 //
6249 // length is too short, just fill qwords
6250 //
6251 align(16);
6252 BIND(L_fill_8_bytes_loop);
6253 movq(Address(to, 0), xtmp);
6254 addptr(to, 8);
6255 BIND(L_fill_8_bytes);
6256 subptr(count, 1 << (shift + 1));
6257 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
6258 }
6259 }
6260
6261 Label L_fill_4_bytes_loop;
6262 testl(count, 1 << shift);
6263 jccb(Assembler::zero, L_fill_2_bytes);
6264
6265 align(16);
6266 BIND(L_fill_4_bytes_loop);
6267 movl(Address(to, 0), value);
6268 addptr(to, 4);
6269
6270 BIND(L_fill_4_bytes);
6271 subptr(count, 1 << shift);
6272 jccb(Assembler::greaterEqual, L_fill_4_bytes_loop);
6273
6274 if (t == T_BYTE || t == T_SHORT) {
6275 Label L_fill_byte;
6276 BIND(L_fill_2_bytes);
6277 // fill trailing 2 bytes
6278 testl(count, 1<<(shift-1));
6279 jccb(Assembler::zero, L_fill_byte);
6280 movw(Address(to, 0), value);
6281 if (t == T_BYTE) {
6282 addptr(to, 2);
6283 BIND(L_fill_byte);
6284 // fill trailing byte
6285 testl(count, 1);
6286 jccb(Assembler::zero, L_exit);
6287 movb(Address(to, 0), value);
6288 } else {
6289 BIND(L_fill_byte);
6290 }
6291 } else {
6292 BIND(L_fill_2_bytes);
6293 }
6294 BIND(L_exit);
6295 }
6296
6297 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
6298 switch(type) {
6299 case T_BYTE:
6300 case T_BOOLEAN:
6301 evpbroadcastb(dst, src, vector_len);
6302 break;
6303 case T_SHORT:
6304 case T_CHAR:
6305 evpbroadcastw(dst, src, vector_len);
6306 break;
6307 case T_INT:
6308 case T_FLOAT:
6309 evpbroadcastd(dst, src, vector_len);
6310 break;
6311 case T_LONG:
6312 case T_DOUBLE:
6313 evpbroadcastq(dst, src, vector_len);
6314 break;
6315 default:
6316 fatal("Unhandled type : %s", type2name(type));
6317 break;
6318 }
6319 }
6320
6321 // Encode given char[]/byte[] to byte[] in ISO_8859_1 or ASCII
6322 //
6323 // @IntrinsicCandidate
6324 // int sun.nio.cs.ISO_8859_1.Encoder#encodeISOArray0(
6325 // char[] sa, int sp, byte[] da, int dp, int len) {
6326 // int i = 0;
6327 // for (; i < len; i++) {
6328 // char c = sa[sp++];
6329 // if (c > '\u00FF')
6330 // break;
6331 // da[dp++] = (byte) c;
6332 // }
6333 // return i;
6334 // }
6335 //
6336 // @IntrinsicCandidate
6337 // int java.lang.StringCoding.encodeISOArray0(
6338 // byte[] sa, int sp, byte[] da, int dp, int len) {
6339 // int i = 0;
6340 // for (; i < len; i++) {
6341 // char c = StringUTF16.getChar(sa, sp++);
6342 // if (c > '\u00FF')
6343 // break;
6344 // da[dp++] = (byte) c;
6345 // }
6346 // return i;
6347 // }
6348 //
6349 // @IntrinsicCandidate
6350 // int java.lang.StringCoding.encodeAsciiArray0(
6351 // char[] sa, int sp, byte[] da, int dp, int len) {
6352 // int i = 0;
6353 // for (; i < len; i++) {
6354 // char c = sa[sp++];
6355 // if (c >= '\u0080')
6356 // break;
6357 // da[dp++] = (byte) c;
6358 // }
6359 // return i;
6360 // }
6361 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
6362 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
6363 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
6364 Register tmp5, Register result, bool ascii) {
6365
6366 // rsi: src
6367 // rdi: dst
6368 // rdx: len
6369 // rcx: tmp5
6370 // rax: result
6371 ShortBranchVerifier sbv(this);
6372 assert_different_registers(src, dst, len, tmp5, result);
6373 Label L_done, L_copy_1_char, L_copy_1_char_exit;
6374
6375 int mask = ascii ? 0xff80ff80 : 0xff00ff00;
6376 int short_mask = ascii ? 0xff80 : 0xff00;
6377
6378 // set result
6379 xorl(result, result);
6380 // check for zero length
6381 testl(len, len);
6382 jcc(Assembler::zero, L_done);
6383
6384 movl(result, len);
6385
6386 // Setup pointers
6387 lea(src, Address(src, len, Address::times_2)); // char[]
6388 lea(dst, Address(dst, len, Address::times_1)); // byte[]
6389 negptr(len);
6390
6391 if (UseSSE42Intrinsics || UseAVX >= 2) {
6392 Label L_copy_8_chars, L_copy_8_chars_exit;
6393 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
6394
6395 if (UseAVX >= 2) {
6396 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
6397 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
6398 movdl(tmp1Reg, tmp5);
6399 vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
6400 jmp(L_chars_32_check);
6401
6402 bind(L_copy_32_chars);
6403 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
6404 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
6405 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
6406 vptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector
6407 jccb(Assembler::notZero, L_copy_32_chars_exit);
6408 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
6409 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
6410 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
6411
6412 bind(L_chars_32_check);
6413 addptr(len, 32);
6414 jcc(Assembler::lessEqual, L_copy_32_chars);
6415
6416 bind(L_copy_32_chars_exit);
6417 subptr(len, 16);
6418 jccb(Assembler::greater, L_copy_16_chars_exit);
6419
6420 } else if (UseSSE42Intrinsics) {
6421 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
6422 movdl(tmp1Reg, tmp5);
6423 pshufd(tmp1Reg, tmp1Reg, 0);
6424 jmpb(L_chars_16_check);
6425 }
6426
6427 bind(L_copy_16_chars);
6428 if (UseAVX >= 2) {
6429 vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
6430 vptest(tmp2Reg, tmp1Reg);
6431 jcc(Assembler::notZero, L_copy_16_chars_exit);
6432 vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
6433 vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
6434 } else {
6435 if (UseAVX > 0) {
6436 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
6437 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
6438 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
6439 } else {
6440 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
6441 por(tmp2Reg, tmp3Reg);
6442 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
6443 por(tmp2Reg, tmp4Reg);
6444 }
6445 ptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector
6446 jccb(Assembler::notZero, L_copy_16_chars_exit);
6447 packuswb(tmp3Reg, tmp4Reg);
6448 }
6449 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
6450
6451 bind(L_chars_16_check);
6452 addptr(len, 16);
6453 jcc(Assembler::lessEqual, L_copy_16_chars);
6454
6455 bind(L_copy_16_chars_exit);
6456 if (UseAVX >= 2) {
6457 // clean upper bits of YMM registers
6458 vpxor(tmp2Reg, tmp2Reg);
6459 vpxor(tmp3Reg, tmp3Reg);
6460 vpxor(tmp4Reg, tmp4Reg);
6461 movdl(tmp1Reg, tmp5);
6462 pshufd(tmp1Reg, tmp1Reg, 0);
6463 }
6464 subptr(len, 8);
6465 jccb(Assembler::greater, L_copy_8_chars_exit);
6466
6467 bind(L_copy_8_chars);
6468 movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
6469 ptest(tmp3Reg, tmp1Reg);
6470 jccb(Assembler::notZero, L_copy_8_chars_exit);
6471 packuswb(tmp3Reg, tmp1Reg);
6472 movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
6473 addptr(len, 8);
6474 jccb(Assembler::lessEqual, L_copy_8_chars);
6475
6476 bind(L_copy_8_chars_exit);
6477 subptr(len, 8);
6478 jccb(Assembler::zero, L_done);
6479 }
6480
6481 bind(L_copy_1_char);
6482 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
6483 testl(tmp5, short_mask); // check if Unicode or non-ASCII char
6484 jccb(Assembler::notZero, L_copy_1_char_exit);
6485 movb(Address(dst, len, Address::times_1, 0), tmp5);
6486 addptr(len, 1);
6487 jccb(Assembler::less, L_copy_1_char);
6488
6489 bind(L_copy_1_char_exit);
6490 addptr(result, len); // len is negative count of not processed elements
6491
6492 bind(L_done);
6493 }
6494
6495 /**
6496 * Helper for multiply_to_len().
6497 */
6498 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
6499 addq(dest_lo, src1);
6500 adcq(dest_hi, 0);
6501 addq(dest_lo, src2);
6502 adcq(dest_hi, 0);
6503 }
6504
6505 /**
6506 * Multiply 64 bit by 64 bit first loop.
6507 */
6508 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
6509 Register y, Register y_idx, Register z,
6510 Register carry, Register product,
6511 Register idx, Register kdx) {
6512 //
6513 // jlong carry, x[], y[], z[];
6514 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
6515 // huge_128 product = y[idx] * x[xstart] + carry;
6516 // z[kdx] = (jlong)product;
6517 // carry = (jlong)(product >>> 64);
6518 // }
6519 // z[xstart] = carry;
6520 //
6521
6522 Label L_first_loop, L_first_loop_exit;
6523 Label L_one_x, L_one_y, L_multiply;
6524
6525 decrementl(xstart);
6526 jcc(Assembler::negative, L_one_x);
6527
6528 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
6529 rorq(x_xstart, 32); // convert big-endian to little-endian
6530
6531 bind(L_first_loop);
6532 decrementl(idx);
6533 jcc(Assembler::negative, L_first_loop_exit);
6534 decrementl(idx);
6535 jcc(Assembler::negative, L_one_y);
6536 movq(y_idx, Address(y, idx, Address::times_4, 0));
6537 rorq(y_idx, 32); // convert big-endian to little-endian
6538 bind(L_multiply);
6539 movq(product, x_xstart);
6540 mulq(y_idx); // product(rax) * y_idx -> rdx:rax
6541 addq(product, carry);
6542 adcq(rdx, 0);
6543 subl(kdx, 2);
6544 movl(Address(z, kdx, Address::times_4, 4), product);
6545 shrq(product, 32);
6546 movl(Address(z, kdx, Address::times_4, 0), product);
6547 movq(carry, rdx);
6548 jmp(L_first_loop);
6549
6550 bind(L_one_y);
6551 movl(y_idx, Address(y, 0));
6552 jmp(L_multiply);
6553
6554 bind(L_one_x);
6555 movl(x_xstart, Address(x, 0));
6556 jmp(L_first_loop);
6557
6558 bind(L_first_loop_exit);
6559 }
6560
6561 /**
6562 * Multiply 64 bit by 64 bit and add 128 bit.
6563 */
6564 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
6565 Register yz_idx, Register idx,
6566 Register carry, Register product, int offset) {
6567 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
6568 // z[kdx] = (jlong)product;
6569
6570 movq(yz_idx, Address(y, idx, Address::times_4, offset));
6571 rorq(yz_idx, 32); // convert big-endian to little-endian
6572 movq(product, x_xstart);
6573 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
6574 movq(yz_idx, Address(z, idx, Address::times_4, offset));
6575 rorq(yz_idx, 32); // convert big-endian to little-endian
6576
6577 add2_with_carry(rdx, product, carry, yz_idx);
6578
6579 movl(Address(z, idx, Address::times_4, offset+4), product);
6580 shrq(product, 32);
6581 movl(Address(z, idx, Address::times_4, offset), product);
6582
6583 }
6584
6585 /**
6586 * Multiply 128 bit by 128 bit. Unrolled inner loop.
6587 */
6588 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
6589 Register yz_idx, Register idx, Register jdx,
6590 Register carry, Register product,
6591 Register carry2) {
6592 // jlong carry, x[], y[], z[];
6593 // int kdx = ystart+1;
6594 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6595 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
6596 // z[kdx+idx+1] = (jlong)product;
6597 // jlong carry2 = (jlong)(product >>> 64);
6598 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
6599 // z[kdx+idx] = (jlong)product;
6600 // carry = (jlong)(product >>> 64);
6601 // }
6602 // idx += 2;
6603 // if (idx > 0) {
6604 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
6605 // z[kdx+idx] = (jlong)product;
6606 // carry = (jlong)(product >>> 64);
6607 // }
6608 //
6609
6610 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6611
6612 movl(jdx, idx);
6613 andl(jdx, 0xFFFFFFFC);
6614 shrl(jdx, 2);
6615
6616 bind(L_third_loop);
6617 subl(jdx, 1);
6618 jcc(Assembler::negative, L_third_loop_exit);
6619 subl(idx, 4);
6620
6621 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
6622 movq(carry2, rdx);
6623
6624 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
6625 movq(carry, rdx);
6626 jmp(L_third_loop);
6627
6628 bind (L_third_loop_exit);
6629
6630 andl (idx, 0x3);
6631 jcc(Assembler::zero, L_post_third_loop_done);
6632
6633 Label L_check_1;
6634 subl(idx, 2);
6635 jcc(Assembler::negative, L_check_1);
6636
6637 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
6638 movq(carry, rdx);
6639
6640 bind (L_check_1);
6641 addl (idx, 0x2);
6642 andl (idx, 0x1);
6643 subl(idx, 1);
6644 jcc(Assembler::negative, L_post_third_loop_done);
6645
6646 movl(yz_idx, Address(y, idx, Address::times_4, 0));
6647 movq(product, x_xstart);
6648 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
6649 movl(yz_idx, Address(z, idx, Address::times_4, 0));
6650
6651 add2_with_carry(rdx, product, yz_idx, carry);
6652
6653 movl(Address(z, idx, Address::times_4, 0), product);
6654 shrq(product, 32);
6655
6656 shlq(rdx, 32);
6657 orq(product, rdx);
6658 movq(carry, product);
6659
6660 bind(L_post_third_loop_done);
6661 }
6662
6663 /**
6664 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
6665 *
6666 */
6667 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
6668 Register carry, Register carry2,
6669 Register idx, Register jdx,
6670 Register yz_idx1, Register yz_idx2,
6671 Register tmp, Register tmp3, Register tmp4) {
6672 assert(UseBMI2Instructions, "should be used only when BMI2 is available");
6673
6674 // jlong carry, x[], y[], z[];
6675 // int kdx = ystart+1;
6676 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6677 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
6678 // jlong carry2 = (jlong)(tmp3 >>> 64);
6679 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
6680 // carry = (jlong)(tmp4 >>> 64);
6681 // z[kdx+idx+1] = (jlong)tmp3;
6682 // z[kdx+idx] = (jlong)tmp4;
6683 // }
6684 // idx += 2;
6685 // if (idx > 0) {
6686 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
6687 // z[kdx+idx] = (jlong)yz_idx1;
6688 // carry = (jlong)(yz_idx1 >>> 64);
6689 // }
6690 //
6691
6692 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6693
6694 movl(jdx, idx);
6695 andl(jdx, 0xFFFFFFFC);
6696 shrl(jdx, 2);
6697
6698 bind(L_third_loop);
6699 subl(jdx, 1);
6700 jcc(Assembler::negative, L_third_loop_exit);
6701 subl(idx, 4);
6702
6703 movq(yz_idx1, Address(y, idx, Address::times_4, 8));
6704 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
6705 movq(yz_idx2, Address(y, idx, Address::times_4, 0));
6706 rorxq(yz_idx2, yz_idx2, 32);
6707
6708 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
6709 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp
6710
6711 movq(yz_idx1, Address(z, idx, Address::times_4, 8));
6712 rorxq(yz_idx1, yz_idx1, 32);
6713 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
6714 rorxq(yz_idx2, yz_idx2, 32);
6715
6716 if (VM_Version::supports_adx()) {
6717 adcxq(tmp3, carry);
6718 adoxq(tmp3, yz_idx1);
6719
6720 adcxq(tmp4, tmp);
6721 adoxq(tmp4, yz_idx2);
6722
6723 movl(carry, 0); // does not affect flags
6724 adcxq(carry2, carry);
6725 adoxq(carry2, carry);
6726 } else {
6727 add2_with_carry(tmp4, tmp3, carry, yz_idx1);
6728 add2_with_carry(carry2, tmp4, tmp, yz_idx2);
6729 }
6730 movq(carry, carry2);
6731
6732 movl(Address(z, idx, Address::times_4, 12), tmp3);
6733 shrq(tmp3, 32);
6734 movl(Address(z, idx, Address::times_4, 8), tmp3);
6735
6736 movl(Address(z, idx, Address::times_4, 4), tmp4);
6737 shrq(tmp4, 32);
6738 movl(Address(z, idx, Address::times_4, 0), tmp4);
6739
6740 jmp(L_third_loop);
6741
6742 bind (L_third_loop_exit);
6743
6744 andl (idx, 0x3);
6745 jcc(Assembler::zero, L_post_third_loop_done);
6746
6747 Label L_check_1;
6748 subl(idx, 2);
6749 jcc(Assembler::negative, L_check_1);
6750
6751 movq(yz_idx1, Address(y, idx, Address::times_4, 0));
6752 rorxq(yz_idx1, yz_idx1, 32);
6753 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
6754 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
6755 rorxq(yz_idx2, yz_idx2, 32);
6756
6757 add2_with_carry(tmp4, tmp3, carry, yz_idx2);
6758
6759 movl(Address(z, idx, Address::times_4, 4), tmp3);
6760 shrq(tmp3, 32);
6761 movl(Address(z, idx, Address::times_4, 0), tmp3);
6762 movq(carry, tmp4);
6763
6764 bind (L_check_1);
6765 addl (idx, 0x2);
6766 andl (idx, 0x1);
6767 subl(idx, 1);
6768 jcc(Assembler::negative, L_post_third_loop_done);
6769 movl(tmp4, Address(y, idx, Address::times_4, 0));
6770 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
6771 movl(tmp4, Address(z, idx, Address::times_4, 0));
6772
6773 add2_with_carry(carry2, tmp3, tmp4, carry);
6774
6775 movl(Address(z, idx, Address::times_4, 0), tmp3);
6776 shrq(tmp3, 32);
6777
6778 shlq(carry2, 32);
6779 orq(tmp3, carry2);
6780 movq(carry, tmp3);
6781
6782 bind(L_post_third_loop_done);
6783 }
6784
6785 /**
6786 * Code for BigInteger::multiplyToLen() intrinsic.
6787 *
6788 * rdi: x
6789 * rax: xlen
6790 * rsi: y
6791 * rcx: ylen
6792 * r8: z
6793 * r11: tmp0
6794 * r12: tmp1
6795 * r13: tmp2
6796 * r14: tmp3
6797 * r15: tmp4
6798 * rbx: tmp5
6799 *
6800 */
6801 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register tmp0,
6802 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
6803 ShortBranchVerifier sbv(this);
6804 assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
6805
6806 push(tmp0);
6807 push(tmp1);
6808 push(tmp2);
6809 push(tmp3);
6810 push(tmp4);
6811 push(tmp5);
6812
6813 push(xlen);
6814
6815 const Register idx = tmp1;
6816 const Register kdx = tmp2;
6817 const Register xstart = tmp3;
6818
6819 const Register y_idx = tmp4;
6820 const Register carry = tmp5;
6821 const Register product = xlen;
6822 const Register x_xstart = tmp0;
6823
6824 // First Loop.
6825 //
6826 // final static long LONG_MASK = 0xffffffffL;
6827 // int xstart = xlen - 1;
6828 // int ystart = ylen - 1;
6829 // long carry = 0;
6830 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
6831 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
6832 // z[kdx] = (int)product;
6833 // carry = product >>> 32;
6834 // }
6835 // z[xstart] = (int)carry;
6836 //
6837
6838 movl(idx, ylen); // idx = ylen;
6839 lea(kdx, Address(xlen, ylen)); // kdx = xlen+ylen;
6840 xorq(carry, carry); // carry = 0;
6841
6842 Label L_done;
6843
6844 movl(xstart, xlen);
6845 decrementl(xstart);
6846 jcc(Assembler::negative, L_done);
6847
6848 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
6849
6850 Label L_second_loop;
6851 testl(kdx, kdx);
6852 jcc(Assembler::zero, L_second_loop);
6853
6854 Label L_carry;
6855 subl(kdx, 1);
6856 jcc(Assembler::zero, L_carry);
6857
6858 movl(Address(z, kdx, Address::times_4, 0), carry);
6859 shrq(carry, 32);
6860 subl(kdx, 1);
6861
6862 bind(L_carry);
6863 movl(Address(z, kdx, Address::times_4, 0), carry);
6864
6865 // Second and third (nested) loops.
6866 //
6867 // for (int i = xstart-1; i >= 0; i--) { // Second loop
6868 // carry = 0;
6869 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
6870 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
6871 // (z[k] & LONG_MASK) + carry;
6872 // z[k] = (int)product;
6873 // carry = product >>> 32;
6874 // }
6875 // z[i] = (int)carry;
6876 // }
6877 //
6878 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
6879
6880 const Register jdx = tmp1;
6881
6882 bind(L_second_loop);
6883 xorl(carry, carry); // carry = 0;
6884 movl(jdx, ylen); // j = ystart+1
6885
6886 subl(xstart, 1); // i = xstart-1;
6887 jcc(Assembler::negative, L_done);
6888
6889 push (z);
6890
6891 Label L_last_x;
6892 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
6893 subl(xstart, 1); // i = xstart-1;
6894 jcc(Assembler::negative, L_last_x);
6895
6896 if (UseBMI2Instructions) {
6897 movq(rdx, Address(x, xstart, Address::times_4, 0));
6898 rorxq(rdx, rdx, 32); // convert big-endian to little-endian
6899 } else {
6900 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
6901 rorq(x_xstart, 32); // convert big-endian to little-endian
6902 }
6903
6904 Label L_third_loop_prologue;
6905 bind(L_third_loop_prologue);
6906
6907 push (x);
6908 push (xstart);
6909 push (ylen);
6910
6911
6912 if (UseBMI2Instructions) {
6913 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
6914 } else { // !UseBMI2Instructions
6915 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
6916 }
6917
6918 pop(ylen);
6919 pop(xlen);
6920 pop(x);
6921 pop(z);
6922
6923 movl(tmp3, xlen);
6924 addl(tmp3, 1);
6925 movl(Address(z, tmp3, Address::times_4, 0), carry);
6926 subl(tmp3, 1);
6927 jccb(Assembler::negative, L_done);
6928
6929 shrq(carry, 32);
6930 movl(Address(z, tmp3, Address::times_4, 0), carry);
6931 jmp(L_second_loop);
6932
6933 // Next infrequent code is moved outside loops.
6934 bind(L_last_x);
6935 if (UseBMI2Instructions) {
6936 movl(rdx, Address(x, 0));
6937 } else {
6938 movl(x_xstart, Address(x, 0));
6939 }
6940 jmp(L_third_loop_prologue);
6941
6942 bind(L_done);
6943
6944 pop(xlen);
6945
6946 pop(tmp5);
6947 pop(tmp4);
6948 pop(tmp3);
6949 pop(tmp2);
6950 pop(tmp1);
6951 pop(tmp0);
6952 }
6953
6954 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
6955 Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
6956 assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
6957 Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
6958 Label VECTOR8_TAIL, VECTOR4_TAIL;
6959 Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
6960 Label SAME_TILL_END, DONE;
6961 Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
6962
6963 //scale is in rcx in both Win64 and Unix
6964 ShortBranchVerifier sbv(this);
6965
6966 shlq(length);
6967 xorq(result, result);
6968
6969 if ((AVX3Threshold == 0) && (UseAVX > 2) &&
6970 VM_Version::supports_avx512vlbw()) {
6971 Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
6972
6973 cmpq(length, 64);
6974 jcc(Assembler::less, VECTOR32_TAIL);
6975
6976 movq(tmp1, length);
6977 andq(tmp1, 0x3F); // tail count
6978 andq(length, ~(0x3F)); //vector count
6979
6980 bind(VECTOR64_LOOP);
6981 // AVX512 code to compare 64 byte vectors.
6982 evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
6983 evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
6984 kortestql(k7, k7);
6985 jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch
6986 addq(result, 64);
6987 subq(length, 64);
6988 jccb(Assembler::notZero, VECTOR64_LOOP);
6989
6990 //bind(VECTOR64_TAIL);
6991 testq(tmp1, tmp1);
6992 jcc(Assembler::zero, SAME_TILL_END);
6993
6994 //bind(VECTOR64_TAIL);
6995 // AVX512 code to compare up to 63 byte vectors.
6996 mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
6997 shlxq(tmp2, tmp2, tmp1);
6998 notq(tmp2);
6999 kmovql(k3, tmp2);
7000
7001 evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
7002 evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
7003
7004 ktestql(k7, k3);
7005 jcc(Assembler::below, SAME_TILL_END); // not mismatch
7006
7007 bind(VECTOR64_NOT_EQUAL);
7008 kmovql(tmp1, k7);
7009 notq(tmp1);
7010 tzcntq(tmp1, tmp1);
7011 addq(result, tmp1);
7012 shrq(result);
7013 jmp(DONE);
7014 bind(VECTOR32_TAIL);
7015 }
7016
7017 cmpq(length, 8);
7018 jcc(Assembler::equal, VECTOR8_LOOP);
7019 jcc(Assembler::less, VECTOR4_TAIL);
7020
7021 if (UseAVX >= 2) {
7022 Label VECTOR16_TAIL, VECTOR32_LOOP;
7023
7024 cmpq(length, 16);
7025 jcc(Assembler::equal, VECTOR16_LOOP);
7026 jcc(Assembler::less, VECTOR8_LOOP);
7027
7028 cmpq(length, 32);
7029 jccb(Assembler::less, VECTOR16_TAIL);
7030
7031 subq(length, 32);
7032 bind(VECTOR32_LOOP);
7033 vmovdqu(rymm0, Address(obja, result));
7034 vmovdqu(rymm1, Address(objb, result));
7035 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
7036 vptest(rymm2, rymm2);
7037 jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
7038 addq(result, 32);
7039 subq(length, 32);
7040 jcc(Assembler::greaterEqual, VECTOR32_LOOP);
7041 addq(length, 32);
7042 jcc(Assembler::equal, SAME_TILL_END);
7043 //falling through if less than 32 bytes left //close the branch here.
7044
7045 bind(VECTOR16_TAIL);
7046 cmpq(length, 16);
7047 jccb(Assembler::less, VECTOR8_TAIL);
7048 bind(VECTOR16_LOOP);
7049 movdqu(rymm0, Address(obja, result));
7050 movdqu(rymm1, Address(objb, result));
7051 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
7052 ptest(rymm2, rymm2);
7053 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
7054 addq(result, 16);
7055 subq(length, 16);
7056 jcc(Assembler::equal, SAME_TILL_END);
7057 //falling through if less than 16 bytes left
7058 } else {//regular intrinsics
7059
7060 cmpq(length, 16);
7061 jccb(Assembler::less, VECTOR8_TAIL);
7062
7063 subq(length, 16);
7064 bind(VECTOR16_LOOP);
7065 movdqu(rymm0, Address(obja, result));
7066 movdqu(rymm1, Address(objb, result));
7067 pxor(rymm0, rymm1);
7068 ptest(rymm0, rymm0);
7069 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
7070 addq(result, 16);
7071 subq(length, 16);
7072 jccb(Assembler::greaterEqual, VECTOR16_LOOP);
7073 addq(length, 16);
7074 jcc(Assembler::equal, SAME_TILL_END);
7075 //falling through if less than 16 bytes left
7076 }
7077
7078 bind(VECTOR8_TAIL);
7079 cmpq(length, 8);
7080 jccb(Assembler::less, VECTOR4_TAIL);
7081 bind(VECTOR8_LOOP);
7082 movq(tmp1, Address(obja, result));
7083 movq(tmp2, Address(objb, result));
7084 xorq(tmp1, tmp2);
7085 testq(tmp1, tmp1);
7086 jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
7087 addq(result, 8);
7088 subq(length, 8);
7089 jcc(Assembler::equal, SAME_TILL_END);
7090 //falling through if less than 8 bytes left
7091
7092 bind(VECTOR4_TAIL);
7093 cmpq(length, 4);
7094 jccb(Assembler::less, BYTES_TAIL);
7095 bind(VECTOR4_LOOP);
7096 movl(tmp1, Address(obja, result));
7097 xorl(tmp1, Address(objb, result));
7098 testl(tmp1, tmp1);
7099 jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
7100 addq(result, 4);
7101 subq(length, 4);
7102 jcc(Assembler::equal, SAME_TILL_END);
7103 //falling through if less than 4 bytes left
7104
7105 bind(BYTES_TAIL);
7106 bind(BYTES_LOOP);
7107 load_unsigned_byte(tmp1, Address(obja, result));
7108 load_unsigned_byte(tmp2, Address(objb, result));
7109 xorl(tmp1, tmp2);
7110 testl(tmp1, tmp1);
7111 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
7112 decq(length);
7113 jcc(Assembler::zero, SAME_TILL_END);
7114 incq(result);
7115 load_unsigned_byte(tmp1, Address(obja, result));
7116 load_unsigned_byte(tmp2, Address(objb, result));
7117 xorl(tmp1, tmp2);
7118 testl(tmp1, tmp1);
7119 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
7120 decq(length);
7121 jcc(Assembler::zero, SAME_TILL_END);
7122 incq(result);
7123 load_unsigned_byte(tmp1, Address(obja, result));
7124 load_unsigned_byte(tmp2, Address(objb, result));
7125 xorl(tmp1, tmp2);
7126 testl(tmp1, tmp1);
7127 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
7128 jmp(SAME_TILL_END);
7129
7130 if (UseAVX >= 2) {
7131 bind(VECTOR32_NOT_EQUAL);
7132 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
7133 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
7134 vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
7135 vpmovmskb(tmp1, rymm0);
7136 bsfq(tmp1, tmp1);
7137 addq(result, tmp1);
7138 shrq(result);
7139 jmp(DONE);
7140 }
7141
7142 bind(VECTOR16_NOT_EQUAL);
7143 if (UseAVX >= 2) {
7144 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
7145 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
7146 pxor(rymm0, rymm2);
7147 } else {
7148 pcmpeqb(rymm2, rymm2);
7149 pxor(rymm0, rymm1);
7150 pcmpeqb(rymm0, rymm1);
7151 pxor(rymm0, rymm2);
7152 }
7153 pmovmskb(tmp1, rymm0);
7154 bsfq(tmp1, tmp1);
7155 addq(result, tmp1);
7156 shrq(result);
7157 jmpb(DONE);
7158
7159 bind(VECTOR8_NOT_EQUAL);
7160 bind(VECTOR4_NOT_EQUAL);
7161 bsfq(tmp1, tmp1);
7162 shrq(tmp1, 3);
7163 addq(result, tmp1);
7164 bind(BYTES_NOT_EQUAL);
7165 shrq(result);
7166 jmpb(DONE);
7167
7168 bind(SAME_TILL_END);
7169 mov64(result, -1);
7170
7171 bind(DONE);
7172 }
7173
7174 //Helper functions for square_to_len()
7175
7176 /**
7177 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
7178 * Preserves x and z and modifies rest of the registers.
7179 */
7180 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7181 // Perform square and right shift by 1
7182 // Handle odd xlen case first, then for even xlen do the following
7183 // jlong carry = 0;
7184 // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
7185 // huge_128 product = x[j:j+1] * x[j:j+1];
7186 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
7187 // z[i+2:i+3] = (jlong)(product >>> 1);
7188 // carry = (jlong)product;
7189 // }
7190
7191 xorq(tmp5, tmp5); // carry
7192 xorq(rdxReg, rdxReg);
7193 xorl(tmp1, tmp1); // index for x
7194 xorl(tmp4, tmp4); // index for z
7195
7196 Label L_first_loop, L_first_loop_exit;
7197
7198 testl(xlen, 1);
7199 jccb(Assembler::zero, L_first_loop); //jump if xlen is even
7200
7201 // Square and right shift by 1 the odd element using 32 bit multiply
7202 movl(raxReg, Address(x, tmp1, Address::times_4, 0));
7203 imulq(raxReg, raxReg);
7204 shrq(raxReg, 1);
7205 adcq(tmp5, 0);
7206 movq(Address(z, tmp4, Address::times_4, 0), raxReg);
7207 incrementl(tmp1);
7208 addl(tmp4, 2);
7209
7210 // Square and right shift by 1 the rest using 64 bit multiply
7211 bind(L_first_loop);
7212 cmpptr(tmp1, xlen);
7213 jccb(Assembler::equal, L_first_loop_exit);
7214
7215 // Square
7216 movq(raxReg, Address(x, tmp1, Address::times_4, 0));
7217 rorq(raxReg, 32); // convert big-endian to little-endian
7218 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax
7219
7220 // Right shift by 1 and save carry
7221 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
7222 rcrq(rdxReg, 1);
7223 rcrq(raxReg, 1);
7224 adcq(tmp5, 0);
7225
7226 // Store result in z
7227 movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
7228 movq(Address(z, tmp4, Address::times_4, 8), raxReg);
7229
7230 // Update indices for x and z
7231 addl(tmp1, 2);
7232 addl(tmp4, 4);
7233 jmp(L_first_loop);
7234
7235 bind(L_first_loop_exit);
7236 }
7237
7238
7239 /**
7240 * Perform the following multiply add operation using BMI2 instructions
7241 * carry:sum = sum + op1*op2 + carry
7242 * op2 should be in rdx
7243 * op2 is preserved, all other registers are modified
7244 */
7245 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
7246 // assert op2 is rdx
7247 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1
7248 addq(sum, carry);
7249 adcq(tmp2, 0);
7250 addq(sum, op1);
7251 adcq(tmp2, 0);
7252 movq(carry, tmp2);
7253 }
7254
7255 /**
7256 * Perform the following multiply add operation:
7257 * carry:sum = sum + op1*op2 + carry
7258 * Preserves op1, op2 and modifies rest of registers
7259 */
7260 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
7261 // rdx:rax = op1 * op2
7262 movq(raxReg, op2);
7263 mulq(op1);
7264
7265 // rdx:rax = sum + carry + rdx:rax
7266 addq(sum, carry);
7267 adcq(rdxReg, 0);
7268 addq(sum, raxReg);
7269 adcq(rdxReg, 0);
7270
7271 // carry:sum = rdx:sum
7272 movq(carry, rdxReg);
7273 }
7274
7275 /**
7276 * Add 64 bit long carry into z[] with carry propagation.
7277 * Preserves z and carry register values and modifies rest of registers.
7278 *
7279 */
7280 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
7281 Label L_fourth_loop, L_fourth_loop_exit;
7282
7283 movl(tmp1, 1);
7284 subl(zlen, 2);
7285 addq(Address(z, zlen, Address::times_4, 0), carry);
7286
7287 bind(L_fourth_loop);
7288 jccb(Assembler::carryClear, L_fourth_loop_exit);
7289 subl(zlen, 2);
7290 jccb(Assembler::negative, L_fourth_loop_exit);
7291 addq(Address(z, zlen, Address::times_4, 0), tmp1);
7292 jmp(L_fourth_loop);
7293 bind(L_fourth_loop_exit);
7294 }
7295
7296 /**
7297 * Shift z[] left by 1 bit.
7298 * Preserves x, len, z and zlen registers and modifies rest of the registers.
7299 *
7300 */
7301 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
7302
7303 Label L_fifth_loop, L_fifth_loop_exit;
7304
7305 // Fifth loop
7306 // Perform primitiveLeftShift(z, zlen, 1)
7307
7308 const Register prev_carry = tmp1;
7309 const Register new_carry = tmp4;
7310 const Register value = tmp2;
7311 const Register zidx = tmp3;
7312
7313 // int zidx, carry;
7314 // long value;
7315 // carry = 0;
7316 // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
7317 // (carry:value) = (z[i] << 1) | carry ;
7318 // z[i] = value;
7319 // }
7320
7321 movl(zidx, zlen);
7322 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
7323
7324 bind(L_fifth_loop);
7325 decl(zidx); // Use decl to preserve carry flag
7326 decl(zidx);
7327 jccb(Assembler::negative, L_fifth_loop_exit);
7328
7329 if (UseBMI2Instructions) {
7330 movq(value, Address(z, zidx, Address::times_4, 0));
7331 rclq(value, 1);
7332 rorxq(value, value, 32);
7333 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
7334 }
7335 else {
7336 // clear new_carry
7337 xorl(new_carry, new_carry);
7338
7339 // Shift z[i] by 1, or in previous carry and save new carry
7340 movq(value, Address(z, zidx, Address::times_4, 0));
7341 shlq(value, 1);
7342 adcl(new_carry, 0);
7343
7344 orq(value, prev_carry);
7345 rorq(value, 0x20);
7346 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
7347
7348 // Set previous carry = new carry
7349 movl(prev_carry, new_carry);
7350 }
7351 jmp(L_fifth_loop);
7352
7353 bind(L_fifth_loop_exit);
7354 }
7355
7356
7357 /**
7358 * Code for BigInteger::squareToLen() intrinsic
7359 *
7360 * rdi: x
7361 * rsi: len
7362 * r8: z
7363 * rcx: zlen
7364 * r12: tmp1
7365 * r13: tmp2
7366 * r14: tmp3
7367 * r15: tmp4
7368 * rbx: tmp5
7369 *
7370 */
7371 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7372
7373 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
7374 push(tmp1);
7375 push(tmp2);
7376 push(tmp3);
7377 push(tmp4);
7378 push(tmp5);
7379
7380 // First loop
7381 // Store the squares, right shifted one bit (i.e., divided by 2).
7382 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
7383
7384 // Add in off-diagonal sums.
7385 //
7386 // Second, third (nested) and fourth loops.
7387 // zlen +=2;
7388 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
7389 // carry = 0;
7390 // long op2 = x[xidx:xidx+1];
7391 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
7392 // k -= 2;
7393 // long op1 = x[j:j+1];
7394 // long sum = z[k:k+1];
7395 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
7396 // z[k:k+1] = sum;
7397 // }
7398 // add_one_64(z, k, carry, tmp_regs);
7399 // }
7400
7401 const Register carry = tmp5;
7402 const Register sum = tmp3;
7403 const Register op1 = tmp4;
7404 Register op2 = tmp2;
7405
7406 push(zlen);
7407 push(len);
7408 addl(zlen,2);
7409 bind(L_second_loop);
7410 xorq(carry, carry);
7411 subl(zlen, 4);
7412 subl(len, 2);
7413 push(zlen);
7414 push(len);
7415 cmpl(len, 0);
7416 jccb(Assembler::lessEqual, L_second_loop_exit);
7417
7418 // Multiply an array by one 64 bit long.
7419 if (UseBMI2Instructions) {
7420 op2 = rdxReg;
7421 movq(op2, Address(x, len, Address::times_4, 0));
7422 rorxq(op2, op2, 32);
7423 }
7424 else {
7425 movq(op2, Address(x, len, Address::times_4, 0));
7426 rorq(op2, 32);
7427 }
7428
7429 bind(L_third_loop);
7430 decrementl(len);
7431 jccb(Assembler::negative, L_third_loop_exit);
7432 decrementl(len);
7433 jccb(Assembler::negative, L_last_x);
7434
7435 movq(op1, Address(x, len, Address::times_4, 0));
7436 rorq(op1, 32);
7437
7438 bind(L_multiply);
7439 subl(zlen, 2);
7440 movq(sum, Address(z, zlen, Address::times_4, 0));
7441
7442 // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
7443 if (UseBMI2Instructions) {
7444 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
7445 }
7446 else {
7447 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7448 }
7449
7450 movq(Address(z, zlen, Address::times_4, 0), sum);
7451
7452 jmp(L_third_loop);
7453 bind(L_third_loop_exit);
7454
7455 // Fourth loop
7456 // Add 64 bit long carry into z with carry propagation.
7457 // Uses offsetted zlen.
7458 add_one_64(z, zlen, carry, tmp1);
7459
7460 pop(len);
7461 pop(zlen);
7462 jmp(L_second_loop);
7463
7464 // Next infrequent code is moved outside loops.
7465 bind(L_last_x);
7466 movl(op1, Address(x, 0));
7467 jmp(L_multiply);
7468
7469 bind(L_second_loop_exit);
7470 pop(len);
7471 pop(zlen);
7472 pop(len);
7473 pop(zlen);
7474
7475 // Fifth loop
7476 // Shift z left 1 bit.
7477 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
7478
7479 // z[zlen-1] |= x[len-1] & 1;
7480 movl(tmp3, Address(x, len, Address::times_4, -4));
7481 andl(tmp3, 1);
7482 orl(Address(z, zlen, Address::times_4, -4), tmp3);
7483
7484 pop(tmp5);
7485 pop(tmp4);
7486 pop(tmp3);
7487 pop(tmp2);
7488 pop(tmp1);
7489 }
7490
7491 /**
7492 * Helper function for mul_add()
7493 * Multiply the in[] by int k and add to out[] starting at offset offs using
7494 * 128 bit by 32 bit multiply and return the carry in tmp5.
7495 * Only quad int aligned length of in[] is operated on in this function.
7496 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
7497 * This function preserves out, in and k registers.
7498 * len and offset point to the appropriate index in "in" & "out" correspondingly
7499 * tmp5 has the carry.
7500 * other registers are temporary and are modified.
7501 *
7502 */
7503 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
7504 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
7505 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7506
7507 Label L_first_loop, L_first_loop_exit;
7508
7509 movl(tmp1, len);
7510 shrl(tmp1, 2);
7511
7512 bind(L_first_loop);
7513 subl(tmp1, 1);
7514 jccb(Assembler::negative, L_first_loop_exit);
7515
7516 subl(len, 4);
7517 subl(offset, 4);
7518
7519 Register op2 = tmp2;
7520 const Register sum = tmp3;
7521 const Register op1 = tmp4;
7522 const Register carry = tmp5;
7523
7524 if (UseBMI2Instructions) {
7525 op2 = rdxReg;
7526 }
7527
7528 movq(op1, Address(in, len, Address::times_4, 8));
7529 rorq(op1, 32);
7530 movq(sum, Address(out, offset, Address::times_4, 8));
7531 rorq(sum, 32);
7532 if (UseBMI2Instructions) {
7533 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7534 }
7535 else {
7536 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7537 }
7538 // Store back in big endian from little endian
7539 rorq(sum, 0x20);
7540 movq(Address(out, offset, Address::times_4, 8), sum);
7541
7542 movq(op1, Address(in, len, Address::times_4, 0));
7543 rorq(op1, 32);
7544 movq(sum, Address(out, offset, Address::times_4, 0));
7545 rorq(sum, 32);
7546 if (UseBMI2Instructions) {
7547 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7548 }
7549 else {
7550 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7551 }
7552 // Store back in big endian from little endian
7553 rorq(sum, 0x20);
7554 movq(Address(out, offset, Address::times_4, 0), sum);
7555
7556 jmp(L_first_loop);
7557 bind(L_first_loop_exit);
7558 }
7559
7560 /**
7561 * Code for BigInteger::mulAdd() intrinsic
7562 *
7563 * rdi: out
7564 * rsi: in
7565 * r11: offs (out.length - offset)
7566 * rcx: len
7567 * r8: k
7568 * r12: tmp1
7569 * r13: tmp2
7570 * r14: tmp3
7571 * r15: tmp4
7572 * rbx: tmp5
7573 * Multiply the in[] by word k and add to out[], return the carry in rax
7574 */
7575 void MacroAssembler::mul_add(Register out, Register in, Register offs,
7576 Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
7577 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7578
7579 Label L_carry, L_last_in, L_done;
7580
7581 // carry = 0;
7582 // for (int j=len-1; j >= 0; j--) {
7583 // long product = (in[j] & LONG_MASK) * kLong +
7584 // (out[offs] & LONG_MASK) + carry;
7585 // out[offs--] = (int)product;
7586 // carry = product >>> 32;
7587 // }
7588 //
7589 push(tmp1);
7590 push(tmp2);
7591 push(tmp3);
7592 push(tmp4);
7593 push(tmp5);
7594
7595 Register op2 = tmp2;
7596 const Register sum = tmp3;
7597 const Register op1 = tmp4;
7598 const Register carry = tmp5;
7599
7600 if (UseBMI2Instructions) {
7601 op2 = rdxReg;
7602 movl(op2, k);
7603 }
7604 else {
7605 movl(op2, k);
7606 }
7607
7608 xorq(carry, carry);
7609
7610 //First loop
7611
7612 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
7613 //The carry is in tmp5
7614 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
7615
7616 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
7617 decrementl(len);
7618 jccb(Assembler::negative, L_carry);
7619 decrementl(len);
7620 jccb(Assembler::negative, L_last_in);
7621
7622 movq(op1, Address(in, len, Address::times_4, 0));
7623 rorq(op1, 32);
7624
7625 subl(offs, 2);
7626 movq(sum, Address(out, offs, Address::times_4, 0));
7627 rorq(sum, 32);
7628
7629 if (UseBMI2Instructions) {
7630 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7631 }
7632 else {
7633 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7634 }
7635
7636 // Store back in big endian from little endian
7637 rorq(sum, 0x20);
7638 movq(Address(out, offs, Address::times_4, 0), sum);
7639
7640 testl(len, len);
7641 jccb(Assembler::zero, L_carry);
7642
7643 //Multiply the last in[] entry, if any
7644 bind(L_last_in);
7645 movl(op1, Address(in, 0));
7646 movl(sum, Address(out, offs, Address::times_4, -4));
7647
7648 movl(raxReg, k);
7649 mull(op1); //tmp4 * eax -> edx:eax
7650 addl(sum, carry);
7651 adcl(rdxReg, 0);
7652 addl(sum, raxReg);
7653 adcl(rdxReg, 0);
7654 movl(carry, rdxReg);
7655
7656 movl(Address(out, offs, Address::times_4, -4), sum);
7657
7658 bind(L_carry);
7659 //return tmp5/carry as carry in rax
7660 movl(rax, carry);
7661
7662 bind(L_done);
7663 pop(tmp5);
7664 pop(tmp4);
7665 pop(tmp3);
7666 pop(tmp2);
7667 pop(tmp1);
7668 }
7669
7670 /**
7671 * Emits code to update CRC-32 with a byte value according to constants in table
7672 *
7673 * @param [in,out]crc Register containing the crc.
7674 * @param [in]val Register containing the byte to fold into the CRC.
7675 * @param [in]table Register containing the table of crc constants.
7676 *
7677 * uint32_t crc;
7678 * val = crc_table[(val ^ crc) & 0xFF];
7679 * crc = val ^ (crc >> 8);
7680 *
7681 */
7682 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7683 xorl(val, crc);
7684 andl(val, 0xFF);
7685 shrl(crc, 8); // unsigned shift
7686 xorl(crc, Address(table, val, Address::times_4, 0));
7687 }
7688
7689 /**
7690 * Fold 128-bit data chunk
7691 */
7692 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7693 if (UseAVX > 0) {
7694 vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7695 vpclmulldq(xcrc, xK, xcrc); // [63:0]
7696 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7697 pxor(xcrc, xtmp);
7698 } else {
7699 movdqa(xtmp, xcrc);
7700 pclmulhdq(xtmp, xK); // [123:64]
7701 pclmulldq(xcrc, xK); // [63:0]
7702 pxor(xcrc, xtmp);
7703 movdqu(xtmp, Address(buf, offset));
7704 pxor(xcrc, xtmp);
7705 }
7706 }
7707
7708 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7709 if (UseAVX > 0) {
7710 vpclmulhdq(xtmp, xK, xcrc);
7711 vpclmulldq(xcrc, xK, xcrc);
7712 pxor(xcrc, xbuf);
7713 pxor(xcrc, xtmp);
7714 } else {
7715 movdqa(xtmp, xcrc);
7716 pclmulhdq(xtmp, xK);
7717 pclmulldq(xcrc, xK);
7718 pxor(xcrc, xbuf);
7719 pxor(xcrc, xtmp);
7720 }
7721 }
7722
7723 /**
7724 * 8-bit folds to compute 32-bit CRC
7725 *
7726 * uint64_t xcrc;
7727 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7728 */
7729 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7730 movdl(tmp, xcrc);
7731 andl(tmp, 0xFF);
7732 movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7733 psrldq(xcrc, 1); // unsigned shift one byte
7734 pxor(xcrc, xtmp);
7735 }
7736
7737 /**
7738 * uint32_t crc;
7739 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7740 */
7741 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7742 movl(tmp, crc);
7743 andl(tmp, 0xFF);
7744 shrl(crc, 8);
7745 xorl(crc, Address(table, tmp, Address::times_4, 0));
7746 }
7747
7748 /**
7749 * @param crc register containing existing CRC (32-bit)
7750 * @param buf register pointing to input byte buffer (byte*)
7751 * @param len register containing number of bytes
7752 * @param table register that will contain address of CRC table
7753 * @param tmp scratch register
7754 */
7755 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7756 assert_different_registers(crc, buf, len, table, tmp, rax);
7757
7758 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7759 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7760
7761 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7762 // context for the registers used, where all instructions below are using 128-bit mode
7763 // On EVEX without VL and BW, these instructions will all be AVX.
7764 lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7765 notl(crc); // ~crc
7766 cmpl(len, 16);
7767 jcc(Assembler::less, L_tail);
7768
7769 // Align buffer to 16 bytes
7770 movl(tmp, buf);
7771 andl(tmp, 0xF);
7772 jccb(Assembler::zero, L_aligned);
7773 subl(tmp, 16);
7774 addl(len, tmp);
7775
7776 align(4);
7777 BIND(L_align_loop);
7778 movsbl(rax, Address(buf, 0)); // load byte with sign extension
7779 update_byte_crc32(crc, rax, table);
7780 increment(buf);
7781 incrementl(tmp);
7782 jccb(Assembler::less, L_align_loop);
7783
7784 BIND(L_aligned);
7785 movl(tmp, len); // save
7786 shrl(len, 4);
7787 jcc(Assembler::zero, L_tail_restore);
7788
7789 // Fold crc into first bytes of vector
7790 movdqa(xmm1, Address(buf, 0));
7791 movdl(rax, xmm1);
7792 xorl(crc, rax);
7793 if (VM_Version::supports_sse4_1()) {
7794 pinsrd(xmm1, crc, 0);
7795 } else {
7796 pinsrw(xmm1, crc, 0);
7797 shrl(crc, 16);
7798 pinsrw(xmm1, crc, 1);
7799 }
7800 addptr(buf, 16);
7801 subl(len, 4); // len > 0
7802 jcc(Assembler::less, L_fold_tail);
7803
7804 movdqa(xmm2, Address(buf, 0));
7805 movdqa(xmm3, Address(buf, 16));
7806 movdqa(xmm4, Address(buf, 32));
7807 addptr(buf, 48);
7808 subl(len, 3);
7809 jcc(Assembler::lessEqual, L_fold_512b);
7810
7811 // Fold total 512 bits of polynomial on each iteration,
7812 // 128 bits per each of 4 parallel streams.
7813 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32), rscratch1);
7814
7815 align32();
7816 BIND(L_fold_512b_loop);
7817 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
7818 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
7819 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
7820 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
7821 addptr(buf, 64);
7822 subl(len, 4);
7823 jcc(Assembler::greater, L_fold_512b_loop);
7824
7825 // Fold 512 bits to 128 bits.
7826 BIND(L_fold_512b);
7827 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
7828 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
7829 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7830 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7831
7832 // Fold the rest of 128 bits data chunks
7833 BIND(L_fold_tail);
7834 addl(len, 3);
7835 jccb(Assembler::lessEqual, L_fold_128b);
7836 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
7837
7838 BIND(L_fold_tail_loop);
7839 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
7840 addptr(buf, 16);
7841 decrementl(len);
7842 jccb(Assembler::greater, L_fold_tail_loop);
7843
7844 // Fold 128 bits in xmm1 down into 32 bits in crc register.
7845 BIND(L_fold_128b);
7846 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()), rscratch1);
7847 if (UseAVX > 0) {
7848 vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7849 vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7850 vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7851 } else {
7852 movdqa(xmm2, xmm0);
7853 pclmulqdq(xmm2, xmm1, 0x1);
7854 movdqa(xmm3, xmm0);
7855 pand(xmm3, xmm2);
7856 pclmulqdq(xmm0, xmm3, 0x1);
7857 }
7858 psrldq(xmm1, 8);
7859 psrldq(xmm2, 4);
7860 pxor(xmm0, xmm1);
7861 pxor(xmm0, xmm2);
7862
7863 // 8 8-bit folds to compute 32-bit CRC.
7864 for (int j = 0; j < 4; j++) {
7865 fold_8bit_crc32(xmm0, table, xmm1, rax);
7866 }
7867 movdl(crc, xmm0); // mov 32 bits to general register
7868 for (int j = 0; j < 4; j++) {
7869 fold_8bit_crc32(crc, table, rax);
7870 }
7871
7872 BIND(L_tail_restore);
7873 movl(len, tmp); // restore
7874 BIND(L_tail);
7875 andl(len, 0xf);
7876 jccb(Assembler::zero, L_exit);
7877
7878 // Fold the rest of bytes
7879 align(4);
7880 BIND(L_tail_loop);
7881 movsbl(rax, Address(buf, 0)); // load byte with sign extension
7882 update_byte_crc32(crc, rax, table);
7883 increment(buf);
7884 decrementl(len);
7885 jccb(Assembler::greater, L_tail_loop);
7886
7887 BIND(L_exit);
7888 notl(crc); // ~c
7889 }
7890
7891 // Helper function for AVX 512 CRC32
7892 // Fold 512-bit data chunks
7893 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
7894 Register pos, int offset) {
7895 evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
7896 evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
7897 evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
7898 evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
7899 evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
7900 }
7901
7902 // Helper function for AVX 512 CRC32
7903 // Compute CRC32 for < 256B buffers
7904 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
7905 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7906 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7907
7908 Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
7909 Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
7910 Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
7911
7912 // check if there is enough buffer to be able to fold 16B at a time
7913 cmpl(len, 32);
7914 jcc(Assembler::less, L_less_than_32);
7915
7916 // if there is, load the constants
7917 movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10
7918 movdl(xmm0, crc); // get the initial crc value
7919 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7920 pxor(xmm7, xmm0);
7921
7922 // update the buffer pointer
7923 addl(pos, 16);
7924 //update the counter.subtract 32 instead of 16 to save one instruction from the loop
7925 subl(len, 32);
7926 jmp(L_16B_reduction_loop);
7927
7928 bind(L_less_than_32);
7929 //mov initial crc to the return value. this is necessary for zero - length buffers.
7930 movl(rax, crc);
7931 testl(len, len);
7932 jcc(Assembler::equal, L_cleanup);
7933
7934 movdl(xmm0, crc); //get the initial crc value
7935
7936 cmpl(len, 16);
7937 jcc(Assembler::equal, L_exact_16_left);
7938 jcc(Assembler::less, L_less_than_16_left);
7939
7940 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7941 pxor(xmm7, xmm0); //xor the initial crc value
7942 addl(pos, 16);
7943 subl(len, 16);
7944 movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10
7945 jmp(L_get_last_two_xmms);
7946
7947 bind(L_less_than_16_left);
7948 //use stack space to load data less than 16 bytes, zero - out the 16B in memory first.
7949 pxor(xmm1, xmm1);
7950 movptr(tmp1, rsp);
7951 movdqu(Address(tmp1, 0 * 16), xmm1);
7952
7953 cmpl(len, 4);
7954 jcc(Assembler::less, L_only_less_than_4);
7955
7956 //backup the counter value
7957 movl(tmp2, len);
7958 cmpl(len, 8);
7959 jcc(Assembler::less, L_less_than_8_left);
7960
7961 //load 8 Bytes
7962 movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
7963 movq(Address(tmp1, 0 * 16), rax);
7964 addptr(tmp1, 8);
7965 subl(len, 8);
7966 addl(pos, 8);
7967
7968 bind(L_less_than_8_left);
7969 cmpl(len, 4);
7970 jcc(Assembler::less, L_less_than_4_left);
7971
7972 //load 4 Bytes
7973 movl(rax, Address(buf, pos, Address::times_1, 0));
7974 movl(Address(tmp1, 0 * 16), rax);
7975 addptr(tmp1, 4);
7976 subl(len, 4);
7977 addl(pos, 4);
7978
7979 bind(L_less_than_4_left);
7980 cmpl(len, 2);
7981 jcc(Assembler::less, L_less_than_2_left);
7982
7983 // load 2 Bytes
7984 movw(rax, Address(buf, pos, Address::times_1, 0));
7985 movl(Address(tmp1, 0 * 16), rax);
7986 addptr(tmp1, 2);
7987 subl(len, 2);
7988 addl(pos, 2);
7989
7990 bind(L_less_than_2_left);
7991 cmpl(len, 1);
7992 jcc(Assembler::less, L_zero_left);
7993
7994 // load 1 Byte
7995 movb(rax, Address(buf, pos, Address::times_1, 0));
7996 movb(Address(tmp1, 0 * 16), rax);
7997
7998 bind(L_zero_left);
7999 movdqu(xmm7, Address(rsp, 0));
8000 pxor(xmm7, xmm0); //xor the initial crc value
8001
8002 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
8003 movdqu(xmm0, Address(rax, tmp2));
8004 pshufb(xmm7, xmm0);
8005 jmp(L_128_done);
8006
8007 bind(L_exact_16_left);
8008 movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
8009 pxor(xmm7, xmm0); //xor the initial crc value
8010 jmp(L_128_done);
8011
8012 bind(L_only_less_than_4);
8013 cmpl(len, 3);
8014 jcc(Assembler::less, L_only_less_than_3);
8015
8016 // load 3 Bytes
8017 movb(rax, Address(buf, pos, Address::times_1, 0));
8018 movb(Address(tmp1, 0), rax);
8019
8020 movb(rax, Address(buf, pos, Address::times_1, 1));
8021 movb(Address(tmp1, 1), rax);
8022
8023 movb(rax, Address(buf, pos, Address::times_1, 2));
8024 movb(Address(tmp1, 2), rax);
8025
8026 movdqu(xmm7, Address(rsp, 0));
8027 pxor(xmm7, xmm0); //xor the initial crc value
8028
8029 pslldq(xmm7, 0x5);
8030 jmp(L_barrett);
8031 bind(L_only_less_than_3);
8032 cmpl(len, 2);
8033 jcc(Assembler::less, L_only_less_than_2);
8034
8035 // load 2 Bytes
8036 movb(rax, Address(buf, pos, Address::times_1, 0));
8037 movb(Address(tmp1, 0), rax);
8038
8039 movb(rax, Address(buf, pos, Address::times_1, 1));
8040 movb(Address(tmp1, 1), rax);
8041
8042 movdqu(xmm7, Address(rsp, 0));
8043 pxor(xmm7, xmm0); //xor the initial crc value
8044
8045 pslldq(xmm7, 0x6);
8046 jmp(L_barrett);
8047
8048 bind(L_only_less_than_2);
8049 //load 1 Byte
8050 movb(rax, Address(buf, pos, Address::times_1, 0));
8051 movb(Address(tmp1, 0), rax);
8052
8053 movdqu(xmm7, Address(rsp, 0));
8054 pxor(xmm7, xmm0); //xor the initial crc value
8055
8056 pslldq(xmm7, 0x7);
8057 }
8058
8059 /**
8060 * Compute CRC32 using AVX512 instructions
8061 * param crc register containing existing CRC (32-bit)
8062 * param buf register pointing to input byte buffer (byte*)
8063 * param len register containing number of bytes
8064 * param table address of crc or crc32c table
8065 * param tmp1 scratch register
8066 * param tmp2 scratch register
8067 * return rax result register
8068 *
8069 * This routine is identical for crc32c with the exception of the precomputed constant
8070 * table which will be passed as the table argument. The calculation steps are
8071 * the same for both variants.
8072 */
8073 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
8074 assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
8075
8076 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8077 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8078 Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
8079 Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
8080 Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
8081
8082 const Register pos = r12;
8083 push(r12);
8084 subptr(rsp, 16 * 2 + 8);
8085
8086 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
8087 // context for the registers used, where all instructions below are using 128-bit mode
8088 // On EVEX without VL and BW, these instructions will all be AVX.
8089 movl(pos, 0);
8090
8091 // check if smaller than 256B
8092 cmpl(len, 256);
8093 jcc(Assembler::less, L_less_than_256);
8094
8095 // load the initial crc value
8096 movdl(xmm10, crc);
8097
8098 // receive the initial 64B data, xor the initial crc value
8099 evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
8100 evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
8101 evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
8102 evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
8103
8104 subl(len, 256);
8105 cmpl(len, 256);
8106 jcc(Assembler::less, L_fold_128_B_loop);
8107
8108 evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
8109 evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
8110 evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
8111 subl(len, 256);
8112
8113 bind(L_fold_256_B_loop);
8114 addl(pos, 256);
8115 fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
8116 fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
8117 fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
8118 fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
8119
8120 subl(len, 256);
8121 jcc(Assembler::greaterEqual, L_fold_256_B_loop);
8122
8123 // Fold 256 into 128
8124 addl(pos, 256);
8125 evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
8126 evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
8127 vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
8128
8129 evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
8130 evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
8131 vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
8132
8133 evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
8134 evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
8135
8136 addl(len, 128);
8137 jmp(L_fold_128_B_register);
8138
8139 // at this section of the code, there is 128 * x + y(0 <= y<128) bytes of buffer.The fold_128_B_loop
8140 // loop will fold 128B at a time until we have 128 + y Bytes of buffer
8141
8142 // fold 128B at a time.This section of the code folds 8 xmm registers in parallel
8143 bind(L_fold_128_B_loop);
8144 addl(pos, 128);
8145 fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
8146 fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
8147
8148 subl(len, 128);
8149 jcc(Assembler::greaterEqual, L_fold_128_B_loop);
8150
8151 addl(pos, 128);
8152
8153 // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
8154 // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
8155 bind(L_fold_128_B_register);
8156 evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
8157 evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
8158 evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
8159 evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
8160 // save last that has no multiplicand
8161 vextracti64x2(xmm7, xmm4, 3);
8162
8163 evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
8164 evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
8165 // Needed later in reduction loop
8166 movdqu(xmm10, Address(table, 1 * 16));
8167 vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
8168 vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
8169
8170 // Swap 1,0,3,2 - 01 00 11 10
8171 evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
8172 evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
8173 vextracti128(xmm5, xmm8, 1);
8174 evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
8175
8176 // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
8177 // instead of a cmp instruction, we use the negative flag with the jl instruction
8178 addl(len, 128 - 16);
8179 jcc(Assembler::less, L_final_reduction_for_128);
8180
8181 bind(L_16B_reduction_loop);
8182 vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8183 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8184 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8185 movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
8186 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8187 addl(pos, 16);
8188 subl(len, 16);
8189 jcc(Assembler::greaterEqual, L_16B_reduction_loop);
8190
8191 bind(L_final_reduction_for_128);
8192 addl(len, 16);
8193 jcc(Assembler::equal, L_128_done);
8194
8195 bind(L_get_last_two_xmms);
8196 movdqu(xmm2, xmm7);
8197 addl(pos, len);
8198 movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
8199 subl(pos, len);
8200
8201 // get rid of the extra data that was loaded before
8202 // load the shift constant
8203 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
8204 movdqu(xmm0, Address(rax, len));
8205 addl(rax, len);
8206
8207 vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8208 //Change mask to 512
8209 vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
8210 vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
8211
8212 blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
8213 vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8214 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8215 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8216 vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
8217
8218 bind(L_128_done);
8219 // compute crc of a 128-bit value
8220 movdqu(xmm10, Address(table, 3 * 16));
8221 movdqu(xmm0, xmm7);
8222
8223 // 64b fold
8224 vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
8225 vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
8226 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8227
8228 // 32b fold
8229 movdqu(xmm0, xmm7);
8230 vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
8231 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8232 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8233 jmp(L_barrett);
8234
8235 bind(L_less_than_256);
8236 kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
8237
8238 //barrett reduction
8239 bind(L_barrett);
8240 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
8241 movdqu(xmm1, xmm7);
8242 movdqu(xmm2, xmm7);
8243 movdqu(xmm10, Address(table, 4 * 16));
8244
8245 pclmulqdq(xmm7, xmm10, 0x0);
8246 pxor(xmm7, xmm2);
8247 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
8248 movdqu(xmm2, xmm7);
8249 pclmulqdq(xmm7, xmm10, 0x10);
8250 pxor(xmm7, xmm2);
8251 pxor(xmm7, xmm1);
8252 pextrd(crc, xmm7, 2);
8253
8254 bind(L_cleanup);
8255 addptr(rsp, 16 * 2 + 8);
8256 pop(r12);
8257 }
8258
8259 // S. Gueron / Information Processing Letters 112 (2012) 184
8260 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
8261 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
8262 // Output: the 64-bit carry-less product of B * CONST
8263 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
8264 Register tmp1, Register tmp2, Register tmp3) {
8265 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
8266 if (n > 0) {
8267 addq(tmp3, n * 256 * 8);
8268 }
8269 // Q1 = TABLEExt[n][B & 0xFF];
8270 movl(tmp1, in);
8271 andl(tmp1, 0x000000FF);
8272 shll(tmp1, 3);
8273 addq(tmp1, tmp3);
8274 movq(tmp1, Address(tmp1, 0));
8275
8276 // Q2 = TABLEExt[n][B >> 8 & 0xFF];
8277 movl(tmp2, in);
8278 shrl(tmp2, 8);
8279 andl(tmp2, 0x000000FF);
8280 shll(tmp2, 3);
8281 addq(tmp2, tmp3);
8282 movq(tmp2, Address(tmp2, 0));
8283
8284 shlq(tmp2, 8);
8285 xorq(tmp1, tmp2);
8286
8287 // Q3 = TABLEExt[n][B >> 16 & 0xFF];
8288 movl(tmp2, in);
8289 shrl(tmp2, 16);
8290 andl(tmp2, 0x000000FF);
8291 shll(tmp2, 3);
8292 addq(tmp2, tmp3);
8293 movq(tmp2, Address(tmp2, 0));
8294
8295 shlq(tmp2, 16);
8296 xorq(tmp1, tmp2);
8297
8298 // Q4 = TABLEExt[n][B >> 24 & 0xFF];
8299 shrl(in, 24);
8300 andl(in, 0x000000FF);
8301 shll(in, 3);
8302 addq(in, tmp3);
8303 movq(in, Address(in, 0));
8304
8305 shlq(in, 24);
8306 xorq(in, tmp1);
8307 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
8308 }
8309
8310 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
8311 Register in_out,
8312 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
8313 XMMRegister w_xtmp2,
8314 Register tmp1,
8315 Register n_tmp2, Register n_tmp3) {
8316 if (is_pclmulqdq_supported) {
8317 movdl(w_xtmp1, in_out); // modified blindly
8318
8319 movl(tmp1, const_or_pre_comp_const_index);
8320 movdl(w_xtmp2, tmp1);
8321 pclmulqdq(w_xtmp1, w_xtmp2, 0);
8322
8323 movdq(in_out, w_xtmp1);
8324 } else {
8325 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
8326 }
8327 }
8328
8329 // Recombination Alternative 2: No bit-reflections
8330 // T1 = (CRC_A * U1) << 1
8331 // T2 = (CRC_B * U2) << 1
8332 // C1 = T1 >> 32
8333 // C2 = T2 >> 32
8334 // T1 = T1 & 0xFFFFFFFF
8335 // T2 = T2 & 0xFFFFFFFF
8336 // T1 = CRC32(0, T1)
8337 // T2 = CRC32(0, T2)
8338 // C1 = C1 ^ T1
8339 // C2 = C2 ^ T2
8340 // CRC = C1 ^ C2 ^ CRC_C
8341 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
8342 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8343 Register tmp1, Register tmp2,
8344 Register n_tmp3) {
8345 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8346 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8347 shlq(in_out, 1);
8348 movl(tmp1, in_out);
8349 shrq(in_out, 32);
8350 xorl(tmp2, tmp2);
8351 crc32(tmp2, tmp1, 4);
8352 xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
8353 shlq(in1, 1);
8354 movl(tmp1, in1);
8355 shrq(in1, 32);
8356 xorl(tmp2, tmp2);
8357 crc32(tmp2, tmp1, 4);
8358 xorl(in1, tmp2);
8359 xorl(in_out, in1);
8360 xorl(in_out, in2);
8361 }
8362
8363 // Set N to predefined value
8364 // Subtract from a length of a buffer
8365 // execute in a loop:
8366 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
8367 // for i = 1 to N do
8368 // CRC_A = CRC32(CRC_A, A[i])
8369 // CRC_B = CRC32(CRC_B, B[i])
8370 // CRC_C = CRC32(CRC_C, C[i])
8371 // end for
8372 // Recombine
8373 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
8374 Register in_out1, Register in_out2, Register in_out3,
8375 Register tmp1, Register tmp2, Register tmp3,
8376 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8377 Register tmp4, Register tmp5,
8378 Register n_tmp6) {
8379 Label L_processPartitions;
8380 Label L_processPartition;
8381 Label L_exit;
8382
8383 bind(L_processPartitions);
8384 cmpl(in_out1, 3 * size);
8385 jcc(Assembler::less, L_exit);
8386 xorl(tmp1, tmp1);
8387 xorl(tmp2, tmp2);
8388 movq(tmp3, in_out2);
8389 addq(tmp3, size);
8390
8391 bind(L_processPartition);
8392 crc32(in_out3, Address(in_out2, 0), 8);
8393 crc32(tmp1, Address(in_out2, size), 8);
8394 crc32(tmp2, Address(in_out2, size * 2), 8);
8395 addq(in_out2, 8);
8396 cmpq(in_out2, tmp3);
8397 jcc(Assembler::less, L_processPartition);
8398 crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
8399 w_xtmp1, w_xtmp2, w_xtmp3,
8400 tmp4, tmp5,
8401 n_tmp6);
8402 addq(in_out2, 2 * size);
8403 subl(in_out1, 3 * size);
8404 jmp(L_processPartitions);
8405
8406 bind(L_exit);
8407 }
8408
8409 // Algorithm 2: Pipelined usage of the CRC32 instruction.
8410 // Input: A buffer I of L bytes.
8411 // Output: the CRC32C value of the buffer.
8412 // Notations:
8413 // Write L = 24N + r, with N = floor (L/24).
8414 // r = L mod 24 (0 <= r < 24).
8415 // Consider I as the concatenation of A|B|C|R, where A, B, C, each,
8416 // N quadwords, and R consists of r bytes.
8417 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
8418 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
8419 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
8420 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
8421 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
8422 Register tmp1, Register tmp2, Register tmp3,
8423 Register tmp4, Register tmp5, Register tmp6,
8424 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8425 bool is_pclmulqdq_supported) {
8426 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
8427 Label L_wordByWord;
8428 Label L_byteByByteProlog;
8429 Label L_byteByByte;
8430 Label L_exit;
8431
8432 if (is_pclmulqdq_supported ) {
8433 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::crc32c_table_addr();
8434 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 1);
8435
8436 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 2);
8437 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 3);
8438
8439 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 4);
8440 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 5);
8441 assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
8442 } else {
8443 const_or_pre_comp_const_index[0] = 1;
8444 const_or_pre_comp_const_index[1] = 0;
8445
8446 const_or_pre_comp_const_index[2] = 3;
8447 const_or_pre_comp_const_index[3] = 2;
8448
8449 const_or_pre_comp_const_index[4] = 5;
8450 const_or_pre_comp_const_index[5] = 4;
8451 }
8452 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8453 in2, in1, in_out,
8454 tmp1, tmp2, tmp3,
8455 w_xtmp1, w_xtmp2, w_xtmp3,
8456 tmp4, tmp5,
8457 tmp6);
8458 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8459 in2, in1, in_out,
8460 tmp1, tmp2, tmp3,
8461 w_xtmp1, w_xtmp2, w_xtmp3,
8462 tmp4, tmp5,
8463 tmp6);
8464 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8465 in2, in1, in_out,
8466 tmp1, tmp2, tmp3,
8467 w_xtmp1, w_xtmp2, w_xtmp3,
8468 tmp4, tmp5,
8469 tmp6);
8470 movl(tmp1, in2);
8471 andl(tmp1, 0x00000007);
8472 negl(tmp1);
8473 addl(tmp1, in2);
8474 addq(tmp1, in1);
8475
8476 cmpq(in1, tmp1);
8477 jccb(Assembler::greaterEqual, L_byteByByteProlog);
8478 align(16);
8479 BIND(L_wordByWord);
8480 crc32(in_out, Address(in1, 0), 8);
8481 addq(in1, 8);
8482 cmpq(in1, tmp1);
8483 jcc(Assembler::less, L_wordByWord);
8484
8485 BIND(L_byteByByteProlog);
8486 andl(in2, 0x00000007);
8487 movl(tmp2, 1);
8488
8489 cmpl(tmp2, in2);
8490 jccb(Assembler::greater, L_exit);
8491 BIND(L_byteByByte);
8492 crc32(in_out, Address(in1, 0), 1);
8493 incq(in1);
8494 incl(tmp2);
8495 cmpl(tmp2, in2);
8496 jcc(Assembler::lessEqual, L_byteByByte);
8497
8498 BIND(L_exit);
8499 }
8500 #undef BIND
8501 #undef BLOCK_COMMENT
8502
8503 // Compress char[] array to byte[].
8504 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
8505 // Return the array length if every element in array can be encoded,
8506 // otherwise, the index of first non-latin1 (> 0xff) character.
8507 // @IntrinsicCandidate
8508 // public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8509 // for (int i = 0; i < len; i++) {
8510 // char c = src[srcOff];
8511 // if (c > 0xff) {
8512 // return i; // return index of non-latin1 char
8513 // }
8514 // dst[dstOff] = (byte)c;
8515 // srcOff++;
8516 // dstOff++;
8517 // }
8518 // return len;
8519 // }
8520 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
8521 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8522 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8523 Register tmp5, Register result, KRegister mask1, KRegister mask2) {
8524 Label copy_chars_loop, done, reset_sp, copy_tail;
8525
8526 // rsi: src
8527 // rdi: dst
8528 // rdx: len
8529 // rcx: tmp5
8530 // rax: result
8531
8532 // rsi holds start addr of source char[] to be compressed
8533 // rdi holds start addr of destination byte[]
8534 // rdx holds length
8535
8536 assert(len != result, "");
8537
8538 // save length for return
8539 movl(result, len);
8540
8541 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
8542 VM_Version::supports_avx512vlbw() &&
8543 VM_Version::supports_bmi2()) {
8544
8545 Label copy_32_loop, copy_loop_tail, below_threshold, reset_for_copy_tail;
8546
8547 // alignment
8548 Label post_alignment;
8549
8550 // if length of the string is less than 32, handle it the old fashioned way
8551 testl(len, -32);
8552 jcc(Assembler::zero, below_threshold);
8553
8554 // First check whether a character is compressible ( <= 0xFF).
8555 // Create mask to test for Unicode chars inside zmm vector
8556 movl(tmp5, 0x00FF);
8557 evpbroadcastw(tmp2Reg, tmp5, Assembler::AVX_512bit);
8558
8559 testl(len, -64);
8560 jccb(Assembler::zero, post_alignment);
8561
8562 movl(tmp5, dst);
8563 andl(tmp5, (32 - 1));
8564 negl(tmp5);
8565 andl(tmp5, (32 - 1));
8566
8567 // bail out when there is nothing to be done
8568 testl(tmp5, 0xFFFFFFFF);
8569 jccb(Assembler::zero, post_alignment);
8570
8571 // ~(~0 << len), where len is the # of remaining elements to process
8572 movl(len, 0xFFFFFFFF);
8573 shlxl(len, len, tmp5);
8574 notl(len);
8575 kmovdl(mask2, len);
8576 movl(len, result);
8577
8578 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8579 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8580 ktestd(mask1, mask2);
8581 jcc(Assembler::carryClear, copy_tail);
8582
8583 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8584
8585 addptr(src, tmp5);
8586 addptr(src, tmp5);
8587 addptr(dst, tmp5);
8588 subl(len, tmp5);
8589
8590 bind(post_alignment);
8591 // end of alignment
8592
8593 movl(tmp5, len);
8594 andl(tmp5, (32 - 1)); // tail count (in chars)
8595 andl(len, ~(32 - 1)); // vector count (in chars)
8596 jccb(Assembler::zero, copy_loop_tail);
8597
8598 lea(src, Address(src, len, Address::times_2));
8599 lea(dst, Address(dst, len, Address::times_1));
8600 negptr(len);
8601
8602 bind(copy_32_loop);
8603 evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
8604 evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
8605 kortestdl(mask1, mask1);
8606 jccb(Assembler::carryClear, reset_for_copy_tail);
8607
8608 // All elements in current processed chunk are valid candidates for
8609 // compression. Write a truncated byte elements to the memory.
8610 evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
8611 addptr(len, 32);
8612 jccb(Assembler::notZero, copy_32_loop);
8613
8614 bind(copy_loop_tail);
8615 // bail out when there is nothing to be done
8616 testl(tmp5, 0xFFFFFFFF);
8617 jcc(Assembler::zero, done);
8618
8619 movl(len, tmp5);
8620
8621 // ~(~0 << len), where len is the # of remaining elements to process
8622 movl(tmp5, 0xFFFFFFFF);
8623 shlxl(tmp5, tmp5, len);
8624 notl(tmp5);
8625
8626 kmovdl(mask2, tmp5);
8627
8628 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8629 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8630 ktestd(mask1, mask2);
8631 jcc(Assembler::carryClear, copy_tail);
8632
8633 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8634 jmp(done);
8635
8636 bind(reset_for_copy_tail);
8637 lea(src, Address(src, tmp5, Address::times_2));
8638 lea(dst, Address(dst, tmp5, Address::times_1));
8639 subptr(len, tmp5);
8640 jmp(copy_chars_loop);
8641
8642 bind(below_threshold);
8643 }
8644
8645 if (UseSSE42Intrinsics) {
8646 Label copy_32_loop, copy_16, copy_tail_sse, reset_for_copy_tail;
8647
8648 // vectored compression
8649 testl(len, 0xfffffff8);
8650 jcc(Assembler::zero, copy_tail);
8651
8652 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
8653 movdl(tmp1Reg, tmp5);
8654 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
8655
8656 andl(len, 0xfffffff0);
8657 jccb(Assembler::zero, copy_16);
8658
8659 // compress 16 chars per iter
8660 pxor(tmp4Reg, tmp4Reg);
8661
8662 lea(src, Address(src, len, Address::times_2));
8663 lea(dst, Address(dst, len, Address::times_1));
8664 negptr(len);
8665
8666 bind(copy_32_loop);
8667 movdqu(tmp2Reg, Address(src, len, Address::times_2)); // load 1st 8 characters
8668 por(tmp4Reg, tmp2Reg);
8669 movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
8670 por(tmp4Reg, tmp3Reg);
8671 ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector
8672 jccb(Assembler::notZero, reset_for_copy_tail);
8673 packuswb(tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte
8674 movdqu(Address(dst, len, Address::times_1), tmp2Reg);
8675 addptr(len, 16);
8676 jccb(Assembler::notZero, copy_32_loop);
8677
8678 // compress next vector of 8 chars (if any)
8679 bind(copy_16);
8680 // len = 0
8681 testl(result, 0x00000008); // check if there's a block of 8 chars to compress
8682 jccb(Assembler::zero, copy_tail_sse);
8683
8684 pxor(tmp3Reg, tmp3Reg);
8685
8686 movdqu(tmp2Reg, Address(src, 0));
8687 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
8688 jccb(Assembler::notZero, reset_for_copy_tail);
8689 packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte
8690 movq(Address(dst, 0), tmp2Reg);
8691 addptr(src, 16);
8692 addptr(dst, 8);
8693 jmpb(copy_tail_sse);
8694
8695 bind(reset_for_copy_tail);
8696 movl(tmp5, result);
8697 andl(tmp5, 0x0000000f);
8698 lea(src, Address(src, tmp5, Address::times_2));
8699 lea(dst, Address(dst, tmp5, Address::times_1));
8700 subptr(len, tmp5);
8701 jmpb(copy_chars_loop);
8702
8703 bind(copy_tail_sse);
8704 movl(len, result);
8705 andl(len, 0x00000007); // tail count (in chars)
8706 }
8707 // compress 1 char per iter
8708 bind(copy_tail);
8709 testl(len, len);
8710 jccb(Assembler::zero, done);
8711 lea(src, Address(src, len, Address::times_2));
8712 lea(dst, Address(dst, len, Address::times_1));
8713 negptr(len);
8714
8715 bind(copy_chars_loop);
8716 load_unsigned_short(tmp5, Address(src, len, Address::times_2));
8717 testl(tmp5, 0xff00); // check if Unicode char
8718 jccb(Assembler::notZero, reset_sp);
8719 movb(Address(dst, len, Address::times_1), tmp5); // ASCII char; compress to 1 byte
8720 increment(len);
8721 jccb(Assembler::notZero, copy_chars_loop);
8722
8723 // add len then return (len will be zero if compress succeeded, otherwise negative)
8724 bind(reset_sp);
8725 addl(result, len);
8726
8727 bind(done);
8728 }
8729
8730 // Inflate byte[] array to char[].
8731 // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
8732 // @IntrinsicCandidate
8733 // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
8734 // for (int i = 0; i < len; i++) {
8735 // dst[dstOff++] = (char)(src[srcOff++] & 0xff);
8736 // }
8737 // }
8738 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
8739 XMMRegister tmp1, Register tmp2, KRegister mask) {
8740 Label copy_chars_loop, done, below_threshold, avx3_threshold;
8741 // rsi: src
8742 // rdi: dst
8743 // rdx: len
8744 // rcx: tmp2
8745
8746 // rsi holds start addr of source byte[] to be inflated
8747 // rdi holds start addr of destination char[]
8748 // rdx holds length
8749 assert_different_registers(src, dst, len, tmp2);
8750 movl(tmp2, len);
8751 if ((UseAVX > 2) && // AVX512
8752 VM_Version::supports_avx512vlbw() &&
8753 VM_Version::supports_bmi2()) {
8754
8755 Label copy_32_loop, copy_tail;
8756 Register tmp3_aliased = len;
8757
8758 // if length of the string is less than 16, handle it in an old fashioned way
8759 testl(len, -16);
8760 jcc(Assembler::zero, below_threshold);
8761
8762 testl(len, -1 * AVX3Threshold);
8763 jcc(Assembler::zero, avx3_threshold);
8764
8765 // In order to use only one arithmetic operation for the main loop we use
8766 // this pre-calculation
8767 andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
8768 andl(len, -32); // vector count
8769 jccb(Assembler::zero, copy_tail);
8770
8771 lea(src, Address(src, len, Address::times_1));
8772 lea(dst, Address(dst, len, Address::times_2));
8773 negptr(len);
8774
8775
8776 // inflate 32 chars per iter
8777 bind(copy_32_loop);
8778 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
8779 evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
8780 addptr(len, 32);
8781 jcc(Assembler::notZero, copy_32_loop);
8782
8783 bind(copy_tail);
8784 // bail out when there is nothing to be done
8785 testl(tmp2, -1); // we don't destroy the contents of tmp2 here
8786 jcc(Assembler::zero, done);
8787
8788 // ~(~0 << length), where length is the # of remaining elements to process
8789 movl(tmp3_aliased, -1);
8790 shlxl(tmp3_aliased, tmp3_aliased, tmp2);
8791 notl(tmp3_aliased);
8792 kmovdl(mask, tmp3_aliased);
8793 evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
8794 evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
8795
8796 jmp(done);
8797 bind(avx3_threshold);
8798 }
8799 if (UseSSE42Intrinsics) {
8800 Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
8801
8802 if (UseAVX > 1) {
8803 andl(tmp2, (16 - 1));
8804 andl(len, -16);
8805 jccb(Assembler::zero, copy_new_tail);
8806 } else {
8807 andl(tmp2, 0x00000007); // tail count (in chars)
8808 andl(len, 0xfffffff8); // vector count (in chars)
8809 jccb(Assembler::zero, copy_tail);
8810 }
8811
8812 // vectored inflation
8813 lea(src, Address(src, len, Address::times_1));
8814 lea(dst, Address(dst, len, Address::times_2));
8815 negptr(len);
8816
8817 if (UseAVX > 1) {
8818 bind(copy_16_loop);
8819 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
8820 vmovdqu(Address(dst, len, Address::times_2), tmp1);
8821 addptr(len, 16);
8822 jcc(Assembler::notZero, copy_16_loop);
8823
8824 bind(below_threshold);
8825 bind(copy_new_tail);
8826 movl(len, tmp2);
8827 andl(tmp2, 0x00000007);
8828 andl(len, 0xFFFFFFF8);
8829 jccb(Assembler::zero, copy_tail);
8830
8831 pmovzxbw(tmp1, Address(src, 0));
8832 movdqu(Address(dst, 0), tmp1);
8833 addptr(src, 8);
8834 addptr(dst, 2 * 8);
8835
8836 jmp(copy_tail, true);
8837 }
8838
8839 // inflate 8 chars per iter
8840 bind(copy_8_loop);
8841 pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
8842 movdqu(Address(dst, len, Address::times_2), tmp1);
8843 addptr(len, 8);
8844 jcc(Assembler::notZero, copy_8_loop);
8845
8846 bind(copy_tail);
8847 movl(len, tmp2);
8848
8849 cmpl(len, 4);
8850 jccb(Assembler::less, copy_bytes);
8851
8852 movdl(tmp1, Address(src, 0)); // load 4 byte chars
8853 pmovzxbw(tmp1, tmp1);
8854 movq(Address(dst, 0), tmp1);
8855 subptr(len, 4);
8856 addptr(src, 4);
8857 addptr(dst, 8);
8858
8859 bind(copy_bytes);
8860 } else {
8861 bind(below_threshold);
8862 }
8863
8864 testl(len, len);
8865 jccb(Assembler::zero, done);
8866 lea(src, Address(src, len, Address::times_1));
8867 lea(dst, Address(dst, len, Address::times_2));
8868 negptr(len);
8869
8870 // inflate 1 char per iter
8871 bind(copy_chars_loop);
8872 load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char
8873 movw(Address(dst, len, Address::times_2), tmp2); // inflate byte char to word
8874 increment(len);
8875 jcc(Assembler::notZero, copy_chars_loop);
8876
8877 bind(done);
8878 }
8879
8880 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
8881 switch(type) {
8882 case T_BYTE:
8883 case T_BOOLEAN:
8884 evmovdqub(dst, kmask, src, merge, vector_len);
8885 break;
8886 case T_CHAR:
8887 case T_SHORT:
8888 evmovdquw(dst, kmask, src, merge, vector_len);
8889 break;
8890 case T_INT:
8891 case T_FLOAT:
8892 evmovdqul(dst, kmask, src, merge, vector_len);
8893 break;
8894 case T_LONG:
8895 case T_DOUBLE:
8896 evmovdquq(dst, kmask, src, merge, vector_len);
8897 break;
8898 default:
8899 fatal("Unexpected type argument %s", type2name(type));
8900 break;
8901 }
8902 }
8903
8904
8905 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
8906 switch(type) {
8907 case T_BYTE:
8908 case T_BOOLEAN:
8909 evmovdqub(dst, kmask, src, merge, vector_len);
8910 break;
8911 case T_CHAR:
8912 case T_SHORT:
8913 evmovdquw(dst, kmask, src, merge, vector_len);
8914 break;
8915 case T_INT:
8916 case T_FLOAT:
8917 evmovdqul(dst, kmask, src, merge, vector_len);
8918 break;
8919 case T_LONG:
8920 case T_DOUBLE:
8921 evmovdquq(dst, kmask, src, merge, vector_len);
8922 break;
8923 default:
8924 fatal("Unexpected type argument %s", type2name(type));
8925 break;
8926 }
8927 }
8928
8929 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
8930 switch(type) {
8931 case T_BYTE:
8932 case T_BOOLEAN:
8933 evmovdqub(dst, kmask, src, merge, vector_len);
8934 break;
8935 case T_CHAR:
8936 case T_SHORT:
8937 evmovdquw(dst, kmask, src, merge, vector_len);
8938 break;
8939 case T_INT:
8940 case T_FLOAT:
8941 evmovdqul(dst, kmask, src, merge, vector_len);
8942 break;
8943 case T_LONG:
8944 case T_DOUBLE:
8945 evmovdquq(dst, kmask, src, merge, vector_len);
8946 break;
8947 default:
8948 fatal("Unexpected type argument %s", type2name(type));
8949 break;
8950 }
8951 }
8952
8953 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
8954 switch(masklen) {
8955 case 2:
8956 knotbl(dst, src);
8957 movl(rtmp, 3);
8958 kmovbl(ktmp, rtmp);
8959 kandbl(dst, ktmp, dst);
8960 break;
8961 case 4:
8962 knotbl(dst, src);
8963 movl(rtmp, 15);
8964 kmovbl(ktmp, rtmp);
8965 kandbl(dst, ktmp, dst);
8966 break;
8967 case 8:
8968 knotbl(dst, src);
8969 break;
8970 case 16:
8971 knotwl(dst, src);
8972 break;
8973 case 32:
8974 knotdl(dst, src);
8975 break;
8976 case 64:
8977 knotql(dst, src);
8978 break;
8979 default:
8980 fatal("Unexpected vector length %d", masklen);
8981 break;
8982 }
8983 }
8984
8985 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8986 switch(type) {
8987 case T_BOOLEAN:
8988 case T_BYTE:
8989 kandbl(dst, src1, src2);
8990 break;
8991 case T_CHAR:
8992 case T_SHORT:
8993 kandwl(dst, src1, src2);
8994 break;
8995 case T_INT:
8996 case T_FLOAT:
8997 kanddl(dst, src1, src2);
8998 break;
8999 case T_LONG:
9000 case T_DOUBLE:
9001 kandql(dst, src1, src2);
9002 break;
9003 default:
9004 fatal("Unexpected type argument %s", type2name(type));
9005 break;
9006 }
9007 }
9008
9009 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9010 switch(type) {
9011 case T_BOOLEAN:
9012 case T_BYTE:
9013 korbl(dst, src1, src2);
9014 break;
9015 case T_CHAR:
9016 case T_SHORT:
9017 korwl(dst, src1, src2);
9018 break;
9019 case T_INT:
9020 case T_FLOAT:
9021 kordl(dst, src1, src2);
9022 break;
9023 case T_LONG:
9024 case T_DOUBLE:
9025 korql(dst, src1, src2);
9026 break;
9027 default:
9028 fatal("Unexpected type argument %s", type2name(type));
9029 break;
9030 }
9031 }
9032
9033 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9034 switch(type) {
9035 case T_BOOLEAN:
9036 case T_BYTE:
9037 kxorbl(dst, src1, src2);
9038 break;
9039 case T_CHAR:
9040 case T_SHORT:
9041 kxorwl(dst, src1, src2);
9042 break;
9043 case T_INT:
9044 case T_FLOAT:
9045 kxordl(dst, src1, src2);
9046 break;
9047 case T_LONG:
9048 case T_DOUBLE:
9049 kxorql(dst, src1, src2);
9050 break;
9051 default:
9052 fatal("Unexpected type argument %s", type2name(type));
9053 break;
9054 }
9055 }
9056
9057 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9058 switch(type) {
9059 case T_BOOLEAN:
9060 case T_BYTE:
9061 evpermb(dst, mask, nds, src, merge, vector_len); break;
9062 case T_CHAR:
9063 case T_SHORT:
9064 evpermw(dst, mask, nds, src, merge, vector_len); break;
9065 case T_INT:
9066 case T_FLOAT:
9067 evpermd(dst, mask, nds, src, merge, vector_len); break;
9068 case T_LONG:
9069 case T_DOUBLE:
9070 evpermq(dst, mask, nds, src, merge, vector_len); break;
9071 default:
9072 fatal("Unexpected type argument %s", type2name(type)); break;
9073 }
9074 }
9075
9076 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9077 switch(type) {
9078 case T_BOOLEAN:
9079 case T_BYTE:
9080 evpermb(dst, mask, nds, src, merge, vector_len); break;
9081 case T_CHAR:
9082 case T_SHORT:
9083 evpermw(dst, mask, nds, src, merge, vector_len); break;
9084 case T_INT:
9085 case T_FLOAT:
9086 evpermd(dst, mask, nds, src, merge, vector_len); break;
9087 case T_LONG:
9088 case T_DOUBLE:
9089 evpermq(dst, mask, nds, src, merge, vector_len); break;
9090 default:
9091 fatal("Unexpected type argument %s", type2name(type)); break;
9092 }
9093 }
9094
9095 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9096 switch(type) {
9097 case T_BYTE:
9098 evpminub(dst, mask, nds, src, merge, vector_len); break;
9099 case T_SHORT:
9100 evpminuw(dst, mask, nds, src, merge, vector_len); break;
9101 case T_INT:
9102 evpminud(dst, mask, nds, src, merge, vector_len); break;
9103 case T_LONG:
9104 evpminuq(dst, mask, nds, src, merge, vector_len); break;
9105 default:
9106 fatal("Unexpected type argument %s", type2name(type)); break;
9107 }
9108 }
9109
9110 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9111 switch(type) {
9112 case T_BYTE:
9113 evpmaxub(dst, mask, nds, src, merge, vector_len); break;
9114 case T_SHORT:
9115 evpmaxuw(dst, mask, nds, src, merge, vector_len); break;
9116 case T_INT:
9117 evpmaxud(dst, mask, nds, src, merge, vector_len); break;
9118 case T_LONG:
9119 evpmaxuq(dst, mask, nds, src, merge, vector_len); break;
9120 default:
9121 fatal("Unexpected type argument %s", type2name(type)); break;
9122 }
9123 }
9124
9125 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9126 switch(type) {
9127 case T_BYTE:
9128 evpminub(dst, mask, nds, src, merge, vector_len); break;
9129 case T_SHORT:
9130 evpminuw(dst, mask, nds, src, merge, vector_len); break;
9131 case T_INT:
9132 evpminud(dst, mask, nds, src, merge, vector_len); break;
9133 case T_LONG:
9134 evpminuq(dst, mask, nds, src, merge, vector_len); break;
9135 default:
9136 fatal("Unexpected type argument %s", type2name(type)); break;
9137 }
9138 }
9139
9140 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9141 switch(type) {
9142 case T_BYTE:
9143 evpmaxub(dst, mask, nds, src, merge, vector_len); break;
9144 case T_SHORT:
9145 evpmaxuw(dst, mask, nds, src, merge, vector_len); break;
9146 case T_INT:
9147 evpmaxud(dst, mask, nds, src, merge, vector_len); break;
9148 case T_LONG:
9149 evpmaxuq(dst, mask, nds, src, merge, vector_len); break;
9150 default:
9151 fatal("Unexpected type argument %s", type2name(type)); break;
9152 }
9153 }
9154
9155 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9156 switch(type) {
9157 case T_BYTE:
9158 evpminsb(dst, mask, nds, src, merge, vector_len); break;
9159 case T_SHORT:
9160 evpminsw(dst, mask, nds, src, merge, vector_len); break;
9161 case T_INT:
9162 evpminsd(dst, mask, nds, src, merge, vector_len); break;
9163 case T_LONG:
9164 evpminsq(dst, mask, nds, src, merge, vector_len); break;
9165 case T_FLOAT:
9166 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9167 case T_DOUBLE:
9168 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9169 default:
9170 fatal("Unexpected type argument %s", type2name(type)); break;
9171 }
9172 }
9173
9174 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9175 switch(type) {
9176 case T_BYTE:
9177 evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9178 case T_SHORT:
9179 evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9180 case T_INT:
9181 evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9182 case T_LONG:
9183 evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9184 case T_FLOAT:
9185 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9186 case T_DOUBLE:
9187 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9188 default:
9189 fatal("Unexpected type argument %s", type2name(type)); break;
9190 }
9191 }
9192
9193 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9194 switch(type) {
9195 case T_BYTE:
9196 evpminsb(dst, mask, nds, src, merge, vector_len); break;
9197 case T_SHORT:
9198 evpminsw(dst, mask, nds, src, merge, vector_len); break;
9199 case T_INT:
9200 evpminsd(dst, mask, nds, src, merge, vector_len); break;
9201 case T_LONG:
9202 evpminsq(dst, mask, nds, src, merge, vector_len); break;
9203 case T_FLOAT:
9204 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9205 case T_DOUBLE:
9206 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9207 default:
9208 fatal("Unexpected type argument %s", type2name(type)); break;
9209 }
9210 }
9211
9212 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9213 switch(type) {
9214 case T_BYTE:
9215 evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9216 case T_SHORT:
9217 evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9218 case T_INT:
9219 evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9220 case T_LONG:
9221 evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9222 case T_FLOAT:
9223 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9224 case T_DOUBLE:
9225 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9226 default:
9227 fatal("Unexpected type argument %s", type2name(type)); break;
9228 }
9229 }
9230
9231 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9232 switch(type) {
9233 case T_INT:
9234 evpxord(dst, mask, nds, src, merge, vector_len); break;
9235 case T_LONG:
9236 evpxorq(dst, mask, nds, src, merge, vector_len); break;
9237 default:
9238 fatal("Unexpected type argument %s", type2name(type)); break;
9239 }
9240 }
9241
9242 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9243 switch(type) {
9244 case T_INT:
9245 evpxord(dst, mask, nds, src, merge, vector_len); break;
9246 case T_LONG:
9247 evpxorq(dst, mask, nds, src, merge, vector_len); break;
9248 default:
9249 fatal("Unexpected type argument %s", type2name(type)); break;
9250 }
9251 }
9252
9253 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9254 switch(type) {
9255 case T_INT:
9256 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9257 case T_LONG:
9258 evporq(dst, mask, nds, src, merge, vector_len); break;
9259 default:
9260 fatal("Unexpected type argument %s", type2name(type)); break;
9261 }
9262 }
9263
9264 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9265 switch(type) {
9266 case T_INT:
9267 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9268 case T_LONG:
9269 evporq(dst, mask, nds, src, merge, vector_len); break;
9270 default:
9271 fatal("Unexpected type argument %s", type2name(type)); break;
9272 }
9273 }
9274
9275 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9276 switch(type) {
9277 case T_INT:
9278 evpandd(dst, mask, nds, src, merge, vector_len); break;
9279 case T_LONG:
9280 evpandq(dst, mask, nds, src, merge, vector_len); break;
9281 default:
9282 fatal("Unexpected type argument %s", type2name(type)); break;
9283 }
9284 }
9285
9286 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9287 switch(type) {
9288 case T_INT:
9289 evpandd(dst, mask, nds, src, merge, vector_len); break;
9290 case T_LONG:
9291 evpandq(dst, mask, nds, src, merge, vector_len); break;
9292 default:
9293 fatal("Unexpected type argument %s", type2name(type)); break;
9294 }
9295 }
9296
9297 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
9298 switch(masklen) {
9299 case 8:
9300 kortestbl(src1, src2);
9301 break;
9302 case 16:
9303 kortestwl(src1, src2);
9304 break;
9305 case 32:
9306 kortestdl(src1, src2);
9307 break;
9308 case 64:
9309 kortestql(src1, src2);
9310 break;
9311 default:
9312 fatal("Unexpected mask length %d", masklen);
9313 break;
9314 }
9315 }
9316
9317
9318 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
9319 switch(masklen) {
9320 case 8:
9321 ktestbl(src1, src2);
9322 break;
9323 case 16:
9324 ktestwl(src1, src2);
9325 break;
9326 case 32:
9327 ktestdl(src1, src2);
9328 break;
9329 case 64:
9330 ktestql(src1, src2);
9331 break;
9332 default:
9333 fatal("Unexpected mask length %d", masklen);
9334 break;
9335 }
9336 }
9337
9338 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9339 switch(type) {
9340 case T_INT:
9341 evprold(dst, mask, src, shift, merge, vlen_enc); break;
9342 case T_LONG:
9343 evprolq(dst, mask, src, shift, merge, vlen_enc); break;
9344 default:
9345 fatal("Unexpected type argument %s", type2name(type)); break;
9346 break;
9347 }
9348 }
9349
9350 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9351 switch(type) {
9352 case T_INT:
9353 evprord(dst, mask, src, shift, merge, vlen_enc); break;
9354 case T_LONG:
9355 evprorq(dst, mask, src, shift, merge, vlen_enc); break;
9356 default:
9357 fatal("Unexpected type argument %s", type2name(type)); break;
9358 }
9359 }
9360
9361 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9362 switch(type) {
9363 case T_INT:
9364 evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
9365 case T_LONG:
9366 evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
9367 default:
9368 fatal("Unexpected type argument %s", type2name(type)); break;
9369 }
9370 }
9371
9372 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9373 switch(type) {
9374 case T_INT:
9375 evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
9376 case T_LONG:
9377 evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
9378 default:
9379 fatal("Unexpected type argument %s", type2name(type)); break;
9380 }
9381 }
9382
9383 void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9384 assert(rscratch != noreg || always_reachable(src), "missing");
9385
9386 if (reachable(src)) {
9387 evpandq(dst, nds, as_Address(src), vector_len);
9388 } else {
9389 lea(rscratch, src);
9390 evpandq(dst, nds, Address(rscratch, 0), vector_len);
9391 }
9392 }
9393
9394 void MacroAssembler::evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
9395 assert(rscratch != noreg || always_reachable(src), "missing");
9396
9397 if (reachable(src)) {
9398 Assembler::evpaddq(dst, mask, nds, as_Address(src), merge, vector_len);
9399 } else {
9400 lea(rscratch, src);
9401 Assembler::evpaddq(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
9402 }
9403 }
9404
9405 void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9406 assert(rscratch != noreg || always_reachable(src), "missing");
9407
9408 if (reachable(src)) {
9409 evporq(dst, nds, as_Address(src), vector_len);
9410 } else {
9411 lea(rscratch, src);
9412 evporq(dst, nds, Address(rscratch, 0), vector_len);
9413 }
9414 }
9415
9416 void MacroAssembler::vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9417 assert(rscratch != noreg || always_reachable(src), "missing");
9418
9419 if (reachable(src)) {
9420 vpshufb(dst, nds, as_Address(src), vector_len);
9421 } else {
9422 lea(rscratch, src);
9423 vpshufb(dst, nds, Address(rscratch, 0), vector_len);
9424 }
9425 }
9426
9427 void MacroAssembler::vpor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9428 assert(rscratch != noreg || always_reachable(src), "missing");
9429
9430 if (reachable(src)) {
9431 Assembler::vpor(dst, nds, as_Address(src), vector_len);
9432 } else {
9433 lea(rscratch, src);
9434 Assembler::vpor(dst, nds, Address(rscratch, 0), vector_len);
9435 }
9436 }
9437
9438 void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) {
9439 assert(rscratch != noreg || always_reachable(src3), "missing");
9440
9441 if (reachable(src3)) {
9442 vpternlogq(dst, imm8, src2, as_Address(src3), vector_len);
9443 } else {
9444 lea(rscratch, src3);
9445 vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len);
9446 }
9447 }
9448
9449 #if COMPILER2_OR_JVMCI
9450
9451 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
9452 Register length, Register temp, int vec_enc) {
9453 // Computing mask for predicated vector store.
9454 movptr(temp, -1);
9455 bzhiq(temp, temp, length);
9456 kmov(mask, temp);
9457 evmovdqu(bt, mask, dst, xmm, true, vec_enc);
9458 }
9459
9460 // Set memory operation for length "less than" 64 bytes.
9461 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
9462 XMMRegister xmm, KRegister mask, Register length,
9463 Register temp, bool use64byteVector) {
9464 assert(MaxVectorSize >= 32, "vector length should be >= 32");
9465 const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9466 if (!use64byteVector) {
9467 fill32(dst, disp, xmm);
9468 subptr(length, 32 >> shift);
9469 fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
9470 } else {
9471 assert(MaxVectorSize == 64, "vector length != 64");
9472 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
9473 }
9474 }
9475
9476
9477 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
9478 XMMRegister xmm, KRegister mask, Register length,
9479 Register temp) {
9480 assert(MaxVectorSize >= 32, "vector length should be >= 32");
9481 const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9482 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
9483 }
9484
9485
9486 void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
9487 assert(MaxVectorSize >= 32, "vector length should be >= 32");
9488 vmovdqu(dst, xmm);
9489 }
9490
9491 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
9492 fill32(Address(dst, disp), xmm);
9493 }
9494
9495 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
9496 assert(MaxVectorSize >= 32, "vector length should be >= 32");
9497 if (!use64byteVector) {
9498 fill32(dst, xmm);
9499 fill32(dst.plus_disp(32), xmm);
9500 } else {
9501 evmovdquq(dst, xmm, Assembler::AVX_512bit);
9502 }
9503 }
9504
9505 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
9506 fill64(Address(dst, disp), xmm, use64byteVector);
9507 }
9508
9509 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
9510 Register count, Register rtmp, XMMRegister xtmp) {
9511 Label L_exit;
9512 Label L_fill_start;
9513 Label L_fill_64_bytes;
9514 Label L_fill_96_bytes;
9515 Label L_fill_128_bytes;
9516 Label L_fill_128_bytes_loop;
9517 Label L_fill_128_loop_header;
9518 Label L_fill_128_bytes_loop_header;
9519 Label L_fill_128_bytes_loop_pre_header;
9520 Label L_fill_zmm_sequence;
9521
9522 int shift = -1;
9523 switch(type) {
9524 case T_BYTE: shift = 0;
9525 break;
9526 case T_SHORT: shift = 1;
9527 break;
9528 case T_INT: shift = 2;
9529 break;
9530 /* Uncomment when LONG fill stubs are supported.
9531 case T_LONG: shift = 3;
9532 break;
9533 */
9534 default:
9535 fatal("Unhandled type: %s\n", type2name(type));
9536 }
9537
9538 if ((CopyAVX3Threshold != 0) || (MaxVectorSize == 32)) {
9539
9540 if (MaxVectorSize == 64) {
9541 cmpq(count, CopyAVX3Threshold >> shift);
9542 jcc(Assembler::greater, L_fill_zmm_sequence);
9543 }
9544
9545 evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
9546
9547 bind(L_fill_start);
9548
9549 cmpq(count, 32 >> shift);
9550 jccb(Assembler::greater, L_fill_64_bytes);
9551 fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
9552 jmp(L_exit);
9553
9554 bind(L_fill_64_bytes);
9555 cmpq(count, 64 >> shift);
9556 jccb(Assembler::greater, L_fill_96_bytes);
9557 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
9558 jmp(L_exit);
9559
9560 bind(L_fill_96_bytes);
9561 cmpq(count, 96 >> shift);
9562 jccb(Assembler::greater, L_fill_128_bytes);
9563 fill64(to, 0, xtmp);
9564 subq(count, 64 >> shift);
9565 fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
9566 jmp(L_exit);
9567
9568 bind(L_fill_128_bytes);
9569 cmpq(count, 128 >> shift);
9570 jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
9571 fill64(to, 0, xtmp);
9572 fill32(to, 64, xtmp);
9573 subq(count, 96 >> shift);
9574 fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
9575 jmp(L_exit);
9576
9577 bind(L_fill_128_bytes_loop_pre_header);
9578 {
9579 mov(rtmp, to);
9580 andq(rtmp, 31);
9581 jccb(Assembler::zero, L_fill_128_bytes_loop_header);
9582 negq(rtmp);
9583 addq(rtmp, 32);
9584 mov64(r8, -1L);
9585 bzhiq(r8, r8, rtmp);
9586 kmovql(k2, r8);
9587 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit);
9588 addq(to, rtmp);
9589 shrq(rtmp, shift);
9590 subq(count, rtmp);
9591 }
9592
9593 cmpq(count, 128 >> shift);
9594 jcc(Assembler::less, L_fill_start);
9595
9596 bind(L_fill_128_bytes_loop_header);
9597 subq(count, 128 >> shift);
9598
9599 align32();
9600 bind(L_fill_128_bytes_loop);
9601 fill64(to, 0, xtmp);
9602 fill64(to, 64, xtmp);
9603 addq(to, 128);
9604 subq(count, 128 >> shift);
9605 jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
9606
9607 addq(count, 128 >> shift);
9608 jcc(Assembler::zero, L_exit);
9609 jmp(L_fill_start);
9610 }
9611
9612 if (MaxVectorSize == 64) {
9613 // Sequence using 64 byte ZMM register.
9614 Label L_fill_128_bytes_zmm;
9615 Label L_fill_192_bytes_zmm;
9616 Label L_fill_192_bytes_loop_zmm;
9617 Label L_fill_192_bytes_loop_header_zmm;
9618 Label L_fill_192_bytes_loop_pre_header_zmm;
9619 Label L_fill_start_zmm_sequence;
9620
9621 bind(L_fill_zmm_sequence);
9622 evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
9623
9624 bind(L_fill_start_zmm_sequence);
9625 cmpq(count, 64 >> shift);
9626 jccb(Assembler::greater, L_fill_128_bytes_zmm);
9627 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
9628 jmp(L_exit);
9629
9630 bind(L_fill_128_bytes_zmm);
9631 cmpq(count, 128 >> shift);
9632 jccb(Assembler::greater, L_fill_192_bytes_zmm);
9633 fill64(to, 0, xtmp, true);
9634 subq(count, 64 >> shift);
9635 fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
9636 jmp(L_exit);
9637
9638 bind(L_fill_192_bytes_zmm);
9639 cmpq(count, 192 >> shift);
9640 jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
9641 fill64(to, 0, xtmp, true);
9642 fill64(to, 64, xtmp, true);
9643 subq(count, 128 >> shift);
9644 fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
9645 jmp(L_exit);
9646
9647 bind(L_fill_192_bytes_loop_pre_header_zmm);
9648 {
9649 movq(rtmp, to);
9650 andq(rtmp, 63);
9651 jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
9652 negq(rtmp);
9653 addq(rtmp, 64);
9654 mov64(r8, -1L);
9655 bzhiq(r8, r8, rtmp);
9656 kmovql(k2, r8);
9657 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit);
9658 addq(to, rtmp);
9659 shrq(rtmp, shift);
9660 subq(count, rtmp);
9661 }
9662
9663 cmpq(count, 192 >> shift);
9664 jcc(Assembler::less, L_fill_start_zmm_sequence);
9665
9666 bind(L_fill_192_bytes_loop_header_zmm);
9667 subq(count, 192 >> shift);
9668
9669 align32();
9670 bind(L_fill_192_bytes_loop_zmm);
9671 fill64(to, 0, xtmp, true);
9672 fill64(to, 64, xtmp, true);
9673 fill64(to, 128, xtmp, true);
9674 addq(to, 192);
9675 subq(count, 192 >> shift);
9676 jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
9677
9678 addq(count, 192 >> shift);
9679 jcc(Assembler::zero, L_exit);
9680 jmp(L_fill_start_zmm_sequence);
9681 }
9682 bind(L_exit);
9683 }
9684 #endif //COMPILER2_OR_JVMCI
9685
9686
9687 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
9688 Label done;
9689 cvttss2sil(dst, src);
9690 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9691 cmpl(dst, 0x80000000); // float_sign_flip
9692 jccb(Assembler::notEqual, done);
9693 subptr(rsp, 8);
9694 movflt(Address(rsp, 0), src);
9695 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
9696 pop(dst);
9697 bind(done);
9698 }
9699
9700 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
9701 Label done;
9702 cvttsd2sil(dst, src);
9703 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9704 cmpl(dst, 0x80000000); // float_sign_flip
9705 jccb(Assembler::notEqual, done);
9706 subptr(rsp, 8);
9707 movdbl(Address(rsp, 0), src);
9708 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
9709 pop(dst);
9710 bind(done);
9711 }
9712
9713 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
9714 Label done;
9715 cvttss2siq(dst, src);
9716 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9717 jccb(Assembler::notEqual, done);
9718 subptr(rsp, 8);
9719 movflt(Address(rsp, 0), src);
9720 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
9721 pop(dst);
9722 bind(done);
9723 }
9724
9725 void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
9726 // Following code is line by line assembly translation rounding algorithm.
9727 // Please refer to java.lang.Math.round(float) algorithm for details.
9728 const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
9729 const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
9730 const int32_t FloatConsts_EXP_BIAS = 127;
9731 const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
9732 const int32_t MINUS_32 = 0xFFFFFFE0;
9733 Label L_special_case, L_block1, L_exit;
9734 movl(rtmp, FloatConsts_EXP_BIT_MASK);
9735 movdl(dst, src);
9736 andl(dst, rtmp);
9737 sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
9738 movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
9739 subl(rtmp, dst);
9740 movl(rcx, rtmp);
9741 movl(dst, MINUS_32);
9742 testl(rtmp, dst);
9743 jccb(Assembler::notEqual, L_special_case);
9744 movdl(dst, src);
9745 andl(dst, FloatConsts_SIGNIF_BIT_MASK);
9746 orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
9747 movdl(rtmp, src);
9748 testl(rtmp, rtmp);
9749 jccb(Assembler::greaterEqual, L_block1);
9750 negl(dst);
9751 bind(L_block1);
9752 sarl(dst);
9753 addl(dst, 0x1);
9754 sarl(dst, 0x1);
9755 jmp(L_exit);
9756 bind(L_special_case);
9757 convert_f2i(dst, src);
9758 bind(L_exit);
9759 }
9760
9761 void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
9762 // Following code is line by line assembly translation rounding algorithm.
9763 // Please refer to java.lang.Math.round(double) algorithm for details.
9764 const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
9765 const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
9766 const int64_t DoubleConsts_EXP_BIAS = 1023;
9767 const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
9768 const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
9769 Label L_special_case, L_block1, L_exit;
9770 mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
9771 movq(dst, src);
9772 andq(dst, rtmp);
9773 sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
9774 mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
9775 subq(rtmp, dst);
9776 movq(rcx, rtmp);
9777 mov64(dst, MINUS_64);
9778 testq(rtmp, dst);
9779 jccb(Assembler::notEqual, L_special_case);
9780 movq(dst, src);
9781 mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
9782 andq(dst, rtmp);
9783 mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
9784 orq(dst, rtmp);
9785 movq(rtmp, src);
9786 testq(rtmp, rtmp);
9787 jccb(Assembler::greaterEqual, L_block1);
9788 negq(dst);
9789 bind(L_block1);
9790 sarq(dst);
9791 addq(dst, 0x1);
9792 sarq(dst, 0x1);
9793 jmp(L_exit);
9794 bind(L_special_case);
9795 convert_d2l(dst, src);
9796 bind(L_exit);
9797 }
9798
9799 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
9800 Label done;
9801 cvttsd2siq(dst, src);
9802 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9803 jccb(Assembler::notEqual, done);
9804 subptr(rsp, 8);
9805 movdbl(Address(rsp, 0), src);
9806 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
9807 pop(dst);
9808 bind(done);
9809 }
9810
9811 void MacroAssembler::cache_wb(Address line)
9812 {
9813 // 64 bit cpus always support clflush
9814 assert(VM_Version::supports_clflush(), "clflush should be available");
9815 bool optimized = VM_Version::supports_clflushopt();
9816 bool no_evict = VM_Version::supports_clwb();
9817
9818 // prefer clwb (writeback without evict) otherwise
9819 // prefer clflushopt (potentially parallel writeback with evict)
9820 // otherwise fallback on clflush (serial writeback with evict)
9821
9822 if (optimized) {
9823 if (no_evict) {
9824 clwb(line);
9825 } else {
9826 clflushopt(line);
9827 }
9828 } else {
9829 // no need for fence when using CLFLUSH
9830 clflush(line);
9831 }
9832 }
9833
9834 void MacroAssembler::cache_wbsync(bool is_pre)
9835 {
9836 assert(VM_Version::supports_clflush(), "clflush should be available");
9837 bool optimized = VM_Version::supports_clflushopt();
9838 bool no_evict = VM_Version::supports_clwb();
9839
9840 // pick the correct implementation
9841
9842 if (!is_pre && (optimized || no_evict)) {
9843 // need an sfence for post flush when using clflushopt or clwb
9844 // otherwise no no need for any synchroniaztion
9845
9846 sfence();
9847 }
9848 }
9849
9850 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9851 switch (cond) {
9852 // Note some conditions are synonyms for others
9853 case Assembler::zero: return Assembler::notZero;
9854 case Assembler::notZero: return Assembler::zero;
9855 case Assembler::less: return Assembler::greaterEqual;
9856 case Assembler::lessEqual: return Assembler::greater;
9857 case Assembler::greater: return Assembler::lessEqual;
9858 case Assembler::greaterEqual: return Assembler::less;
9859 case Assembler::below: return Assembler::aboveEqual;
9860 case Assembler::belowEqual: return Assembler::above;
9861 case Assembler::above: return Assembler::belowEqual;
9862 case Assembler::aboveEqual: return Assembler::below;
9863 case Assembler::overflow: return Assembler::noOverflow;
9864 case Assembler::noOverflow: return Assembler::overflow;
9865 case Assembler::negative: return Assembler::positive;
9866 case Assembler::positive: return Assembler::negative;
9867 case Assembler::parity: return Assembler::noParity;
9868 case Assembler::noParity: return Assembler::parity;
9869 }
9870 ShouldNotReachHere(); return Assembler::overflow;
9871 }
9872
9873 // This is simply a call to Thread::current()
9874 void MacroAssembler::get_thread_slow(Register thread) {
9875 if (thread != rax) {
9876 push(rax);
9877 }
9878 push(rdi);
9879 push(rsi);
9880 push(rdx);
9881 push(rcx);
9882 push(r8);
9883 push(r9);
9884 push(r10);
9885 push(r11);
9886
9887 MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
9888
9889 pop(r11);
9890 pop(r10);
9891 pop(r9);
9892 pop(r8);
9893 pop(rcx);
9894 pop(rdx);
9895 pop(rsi);
9896 pop(rdi);
9897 if (thread != rax) {
9898 mov(thread, rax);
9899 pop(rax);
9900 }
9901 }
9902
9903 void MacroAssembler::check_stack_alignment(Register sp, const char* msg, unsigned bias, Register tmp) {
9904 Label L_stack_ok;
9905 if (bias == 0) {
9906 testptr(sp, 2 * wordSize - 1);
9907 } else {
9908 // lea(tmp, Address(rsp, bias);
9909 mov(tmp, sp);
9910 addptr(tmp, bias);
9911 testptr(tmp, 2 * wordSize - 1);
9912 }
9913 jcc(Assembler::equal, L_stack_ok);
9914 block_comment(msg);
9915 stop(msg);
9916 bind(L_stack_ok);
9917 }
9918
9919 // Implements fast-locking.
9920 //
9921 // obj: the object to be locked
9922 // reg_rax: rax
9923 // thread: the thread which attempts to lock obj
9924 // tmp: a temporary register
9925 void MacroAssembler::fast_lock(Register basic_lock, Register obj, Register reg_rax, Register tmp, Label& slow) {
9926 Register thread = r15_thread;
9927
9928 assert(reg_rax == rax, "");
9929 assert_different_registers(basic_lock, obj, reg_rax, thread, tmp);
9930
9931 Label push;
9932 const Register top = tmp;
9933
9934 // Preload the markWord. It is important that this is the first
9935 // instruction emitted as it is part of C1's null check semantics.
9936 movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
9937
9938 if (UseObjectMonitorTable) {
9939 // Clear cache in case fast locking succeeds or we need to take the slow-path.
9940 movptr(Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))), 0);
9941 }
9942
9943 if (DiagnoseSyncOnValueBasedClasses != 0) {
9944 load_klass(tmp, obj, rscratch1);
9945 testb(Address(tmp, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
9946 jcc(Assembler::notZero, slow);
9947 }
9948
9949 // Load top.
9950 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
9951
9952 // Check if the lock-stack is full.
9953 cmpl(top, LockStack::end_offset());
9954 jcc(Assembler::greaterEqual, slow);
9955
9956 // Check for recursion.
9957 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
9958 jcc(Assembler::equal, push);
9959
9960 // Check header for monitor (0b10).
9961 testptr(reg_rax, markWord::monitor_value);
9962 jcc(Assembler::notZero, slow);
9963
9964 // Try to lock. Transition lock bits 0b01 => 0b00
9965 movptr(tmp, reg_rax);
9966 andptr(tmp, ~(int32_t)markWord::unlocked_value);
9967 orptr(reg_rax, markWord::unlocked_value);
9968 lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
9969 jcc(Assembler::notEqual, slow);
9970
9971 // Restore top, CAS clobbers register.
9972 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
9973
9974 bind(push);
9975 // After successful lock, push object on lock-stack.
9976 movptr(Address(thread, top), obj);
9977 incrementl(top, oopSize);
9978 movl(Address(thread, JavaThread::lock_stack_top_offset()), top);
9979 }
9980
9981 // Implements fast-unlocking.
9982 //
9983 // obj: the object to be unlocked
9984 // reg_rax: rax
9985 // thread: the thread
9986 // tmp: a temporary register
9987 void MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register tmp, Label& slow) {
9988 Register thread = r15_thread;
9989
9990 assert(reg_rax == rax, "");
9991 assert_different_registers(obj, reg_rax, thread, tmp);
9992
9993 Label unlocked, push_and_slow;
9994 const Register top = tmp;
9995
9996 // Check if obj is top of lock-stack.
9997 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
9998 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
9999 jcc(Assembler::notEqual, slow);
10000
10001 // Pop lock-stack.
10002 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
10003 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
10004
10005 // Check if recursive.
10006 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
10007 jcc(Assembler::equal, unlocked);
10008
10009 // Not recursive. Check header for monitor (0b10).
10010 movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
10011 testptr(reg_rax, markWord::monitor_value);
10012 jcc(Assembler::notZero, push_and_slow);
10013
10014 #ifdef ASSERT
10015 // Check header not unlocked (0b01).
10016 Label not_unlocked;
10017 testptr(reg_rax, markWord::unlocked_value);
10018 jcc(Assembler::zero, not_unlocked);
10019 stop("fast_unlock already unlocked");
10020 bind(not_unlocked);
10021 #endif
10022
10023 // Try to unlock. Transition lock bits 0b00 => 0b01
10024 movptr(tmp, reg_rax);
10025 orptr(tmp, markWord::unlocked_value);
10026 lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
10027 jcc(Assembler::equal, unlocked);
10028
10029 bind(push_and_slow);
10030 // Restore lock-stack and handle the unlock in runtime.
10031 #ifdef ASSERT
10032 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
10033 movptr(Address(thread, top), obj);
10034 #endif
10035 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
10036 jmp(slow);
10037
10038 bind(unlocked);
10039 }
10040
10041 // Saves legacy GPRs state on stack.
10042 void MacroAssembler::save_legacy_gprs() {
10043 subq(rsp, 16 * wordSize);
10044 movq(Address(rsp, 15 * wordSize), rax);
10045 movq(Address(rsp, 14 * wordSize), rcx);
10046 movq(Address(rsp, 13 * wordSize), rdx);
10047 movq(Address(rsp, 12 * wordSize), rbx);
10048 movq(Address(rsp, 10 * wordSize), rbp);
10049 movq(Address(rsp, 9 * wordSize), rsi);
10050 movq(Address(rsp, 8 * wordSize), rdi);
10051 movq(Address(rsp, 7 * wordSize), r8);
10052 movq(Address(rsp, 6 * wordSize), r9);
10053 movq(Address(rsp, 5 * wordSize), r10);
10054 movq(Address(rsp, 4 * wordSize), r11);
10055 movq(Address(rsp, 3 * wordSize), r12);
10056 movq(Address(rsp, 2 * wordSize), r13);
10057 movq(Address(rsp, wordSize), r14);
10058 movq(Address(rsp, 0), r15);
10059 }
10060
10061 // Resotres back legacy GPRs state from stack.
10062 void MacroAssembler::restore_legacy_gprs() {
10063 movq(r15, Address(rsp, 0));
10064 movq(r14, Address(rsp, wordSize));
10065 movq(r13, Address(rsp, 2 * wordSize));
10066 movq(r12, Address(rsp, 3 * wordSize));
10067 movq(r11, Address(rsp, 4 * wordSize));
10068 movq(r10, Address(rsp, 5 * wordSize));
10069 movq(r9, Address(rsp, 6 * wordSize));
10070 movq(r8, Address(rsp, 7 * wordSize));
10071 movq(rdi, Address(rsp, 8 * wordSize));
10072 movq(rsi, Address(rsp, 9 * wordSize));
10073 movq(rbp, Address(rsp, 10 * wordSize));
10074 movq(rbx, Address(rsp, 12 * wordSize));
10075 movq(rdx, Address(rsp, 13 * wordSize));
10076 movq(rcx, Address(rsp, 14 * wordSize));
10077 movq(rax, Address(rsp, 15 * wordSize));
10078 addq(rsp, 16 * wordSize);
10079 }
10080
10081 void MacroAssembler::load_aotrc_address(Register reg, address a) {
10082 #if INCLUDE_CDS
10083 assert(AOTRuntimeConstants::contains(a), "address out of range for data area");
10084 if (AOTCodeCache::is_on_for_dump()) {
10085 // all aotrc field addresses should be registered in the AOTCodeCache address table
10086 lea(reg, ExternalAddress(a));
10087 } else {
10088 mov64(reg, (uint64_t)a);
10089 }
10090 #else
10091 ShouldNotReachHere();
10092 #endif
10093 }
10094
10095 void MacroAssembler::setcc(Assembler::Condition comparison, Register dst) {
10096 if (VM_Version::supports_apx_f()) {
10097 esetzucc(comparison, dst);
10098 } else {
10099 setb(comparison, dst);
10100 movzbl(dst, dst);
10101 }
10102 }
--- EOF ---