1 /*
2 * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "code/aotCodeCache.hpp"
28 #include "code/compiledIC.hpp"
29 #include "compiler/compiler_globals.hpp"
30 #include "compiler/disassembler.hpp"
31 #include "crc32c.h"
32 #include "gc/shared/barrierSet.hpp"
33 #include "gc/shared/barrierSetAssembler.hpp"
34 #include "gc/shared/collectedHeap.inline.hpp"
35 #include "gc/shared/tlab_globals.hpp"
36 #include "interpreter/bytecodeHistogram.hpp"
37 #include "interpreter/interpreter.hpp"
38 #include "interpreter/interpreterRuntime.hpp"
39 #include "jvm.h"
40 #include "memory/resourceArea.hpp"
41 #include "memory/universe.hpp"
42 #include "oops/accessDecorators.hpp"
43 #include "oops/compressedKlass.inline.hpp"
44 #include "oops/compressedOops.inline.hpp"
45 #include "oops/klass.inline.hpp"
46 #include "prims/methodHandles.hpp"
47 #include "runtime/continuation.hpp"
48 #include "runtime/interfaceSupport.inline.hpp"
49 #include "runtime/javaThread.hpp"
50 #include "runtime/jniHandles.hpp"
51 #include "runtime/objectMonitor.hpp"
52 #include "runtime/os.hpp"
53 #include "runtime/safepoint.hpp"
54 #include "runtime/safepointMechanism.hpp"
55 #include "runtime/sharedRuntime.hpp"
56 #include "runtime/stubRoutines.hpp"
57 #include "utilities/checkedCast.hpp"
58 #include "utilities/globalDefinitions.hpp"
59 #include "utilities/macros.hpp"
60
// Emission helper macros, intended for use as statements inside assembler
// member functions.
//
//   BLOCK_COMMENT(str)  forwards to block_comment(str) in non-PRODUCT builds
//                       and expands to nothing in PRODUCT builds.
//   STOP(error)         calls stop(error); non-PRODUCT builds first record the
//                       message through block_comment(error).
//   BIND(label)         calls bind(label) and then BLOCK_COMMENT("<label>:").
//
// Expansions that consist of more than one statement are wrapped in
// do { ... } while (0) so that each macro behaves as exactly one statement.
// Without the wrapper, "if (cond) STOP(msg);" would guard only the
// block_comment() call and execute stop() unconditionally.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) do { block_comment(error); stop(error); } while (0)
#endif

#define BIND(label) do { bind(label); BLOCK_COMMENT(#label ":"); } while (0)
70
#ifdef ASSERT
// Platform-dependent instruction-mark check, compiled only when ASSERT is
// defined. This implementation accepts unconditionally.
bool AbstractAssembler::pd_check_instruction_mark() {
  return true;
}
#endif
74
75 static const Assembler::Condition reverse[] = {
76 Assembler::noOverflow /* overflow = 0x0 */ ,
77 Assembler::overflow /* noOverflow = 0x1 */ ,
78 Assembler::aboveEqual /* carrySet = 0x2, below = 0x2 */ ,
79 Assembler::below /* aboveEqual = 0x3, carryClear = 0x3 */ ,
80 Assembler::notZero /* zero = 0x4, equal = 0x4 */ ,
81 Assembler::zero /* notZero = 0x5, notEqual = 0x5 */ ,
82 Assembler::above /* belowEqual = 0x6 */ ,
83 Assembler::belowEqual /* above = 0x7 */ ,
84 Assembler::positive /* negative = 0x8 */ ,
85 Assembler::negative /* positive = 0x9 */ ,
86 Assembler::noParity /* parity = 0xa */ ,
87 Assembler::parity /* noParity = 0xb */ ,
88 Assembler::greaterEqual /* less = 0xc */ ,
89 Assembler::less /* greaterEqual = 0xd */ ,
90 Assembler::greater /* lessEqual = 0xe */ ,
91 Assembler::lessEqual /* greater = 0xf, */
92
93 };
94
95
96 // Implementation of MacroAssembler
97
98 Address MacroAssembler::as_Address(AddressLiteral adr) {
99 // amd64 always does this as a pc-rel
100 // we can be absolute or disp based on the instruction type
101 // jmp/call are displacements others are absolute
102 assert(!adr.is_lval(), "must be rval");
103 assert(reachable(adr), "must be");
104 return Address(checked_cast<int32_t>(adr.target() - pc()), adr.target(), adr.reloc());
105
106 }
107
108 Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
109 AddressLiteral base = adr.base();
110 lea(rscratch, base);
111 Address index = adr.index();
112 assert(index._disp == 0, "must not have disp"); // maybe it can?
113 Address array(rscratch, index._index, index._scale, index._disp);
114 return array;
115 }
116
117 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
118 Label L, E;
119
120 #ifdef _WIN64
121 // Windows always allocates space for it's register args
122 assert(num_args <= 4, "only register arguments supported");
123 subq(rsp, frame::arg_reg_save_area_bytes);
124 #endif
125
126 // Align stack if necessary
127 testl(rsp, 15);
128 jcc(Assembler::zero, L);
129
130 subq(rsp, 8);
131 call(RuntimeAddress(entry_point));
132 addq(rsp, 8);
133 jmp(E);
134
135 bind(L);
136 call(RuntimeAddress(entry_point));
137
138 bind(E);
139
140 #ifdef _WIN64
141 // restore stack pointer
142 addq(rsp, frame::arg_reg_save_area_bytes);
143 #endif
144 }
145
146 void MacroAssembler::cmp64(Register src1, AddressLiteral src2, Register rscratch) {
147 assert(!src2.is_lval(), "should use cmpptr");
148 assert(rscratch != noreg || always_reachable(src2), "missing");
149
150 if (reachable(src2)) {
151 cmpq(src1, as_Address(src2));
152 } else {
153 lea(rscratch, src2);
154 Assembler::cmpq(src1, Address(rscratch, 0));
155 }
156 }
157
158 int MacroAssembler::corrected_idivq(Register reg) {
159 // Full implementation of Java ldiv and lrem; checks for special
160 // case as described in JVM spec., p.243 & p.271. The function
161 // returns the (pc) offset of the idivl instruction - may be needed
162 // for implicit exceptions.
163 //
164 // normal case special case
165 //
166 // input : rax: dividend min_long
167 // reg: divisor (may not be eax/edx) -1
168 //
169 // output: rax: quotient (= rax idiv reg) min_long
170 // rdx: remainder (= rax irem reg) 0
171 assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
172 static const int64_t min_long = 0x8000000000000000;
173 Label normal_case, special_case;
174
175 // check for special case
176 cmp64(rax, ExternalAddress((address) &min_long), rdx /*rscratch*/);
177 jcc(Assembler::notEqual, normal_case);
178 xorl(rdx, rdx); // prepare rdx for possible special case (where
179 // remainder = 0)
180 cmpq(reg, -1);
181 jcc(Assembler::equal, special_case);
182
183 // handle normal case
184 bind(normal_case);
185 cdqq();
186 int idivq_offset = offset();
187 idivq(reg);
188
189 // normal and special case exit
190 bind(special_case);
191
192 return idivq_offset;
193 }
194
195 void MacroAssembler::decrementq(Register reg, int value) {
196 if (value == min_jint) { subq(reg, value); return; }
197 if (value < 0) { incrementq(reg, -value); return; }
198 if (value == 0) { ; return; }
199 if (value == 1 && UseIncDec) { decq(reg) ; return; }
200 /* else */ { subq(reg, value) ; return; }
201 }
202
203 void MacroAssembler::decrementq(Address dst, int value) {
204 if (value == min_jint) { subq(dst, value); return; }
205 if (value < 0) { incrementq(dst, -value); return; }
206 if (value == 0) { ; return; }
207 if (value == 1 && UseIncDec) { decq(dst) ; return; }
208 /* else */ { subq(dst, value) ; return; }
209 }
210
211 void MacroAssembler::incrementq(AddressLiteral dst, Register rscratch) {
212 assert(rscratch != noreg || always_reachable(dst), "missing");
213
214 if (reachable(dst)) {
215 incrementq(as_Address(dst));
216 } else {
217 lea(rscratch, dst);
218 incrementq(Address(rscratch, 0));
219 }
220 }
221
222 void MacroAssembler::incrementq(Register reg, int value) {
223 if (value == min_jint) { addq(reg, value); return; }
224 if (value < 0) { decrementq(reg, -value); return; }
225 if (value == 0) { ; return; }
226 if (value == 1 && UseIncDec) { incq(reg) ; return; }
227 /* else */ { addq(reg, value) ; return; }
228 }
229
230 void MacroAssembler::incrementq(Address dst, int value) {
231 if (value == min_jint) { addq(dst, value); return; }
232 if (value < 0) { decrementq(dst, -value); return; }
233 if (value == 0) { ; return; }
234 if (value == 1 && UseIncDec) { incq(dst) ; return; }
235 /* else */ { addq(dst, value) ; return; }
236 }
237
238 // 32bit can do a case table jump in one instruction but we no longer allow the base
239 // to be installed in the Address class
240 void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
241 lea(rscratch, entry.base());
242 Address dispatch = entry.index();
243 assert(dispatch._base == noreg, "must be");
244 dispatch._base = rscratch;
245 jmp(dispatch);
246 }
247
248 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
249 ShouldNotReachHere(); // 64bit doesn't use two regs
250 cmpq(x_lo, y_lo);
251 }
252
253 void MacroAssembler::lea(Register dst, AddressLiteral src) {
254 mov_literal64(dst, (intptr_t)src.target(), src.rspec());
255 }
256
257 void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
258 lea(rscratch, adr);
259 movptr(dst, rscratch);
260 }
261
262 void MacroAssembler::leave() {
263 // %%% is this really better? Why not on 32bit too?
264 emit_int8((unsigned char)0xC9); // LEAVE
265 }
266
267 void MacroAssembler::lneg(Register hi, Register lo) {
268 ShouldNotReachHere(); // 64bit doesn't use two regs
269 negq(lo);
270 }
271
272 void MacroAssembler::movoop(Register dst, jobject obj) {
273 mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
274 }
275
276 void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
277 mov_literal64(rscratch, (intptr_t)obj, oop_Relocation::spec_for_immediate());
278 movq(dst, rscratch);
279 }
280
281 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
282 mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
283 }
284
285 void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
286 mov_literal64(rscratch, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
287 movq(dst, rscratch);
288 }
289
290 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
291 if (src.is_lval()) {
292 mov_literal64(dst, (intptr_t)src.target(), src.rspec());
293 } else {
294 if (reachable(src)) {
295 movq(dst, as_Address(src));
296 } else {
297 lea(dst, src);
298 movq(dst, Address(dst, 0));
299 }
300 }
301 }
302
303 void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
304 movq(as_Address(dst, rscratch), src);
305 }
306
307 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
308 movq(dst, as_Address(src, dst /*rscratch*/));
309 }
310
311 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
312 void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
313 if (is_simm32(src)) {
314 movptr(dst, checked_cast<int32_t>(src));
315 } else {
316 mov64(rscratch, src);
317 movq(dst, rscratch);
318 }
319 }
320
321 void MacroAssembler::pushoop(jobject obj, Register rscratch) {
322 movoop(rscratch, obj);
323 push(rscratch);
324 }
325
326 void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
327 mov_metadata(rscratch, obj);
328 push(rscratch);
329 }
330
331 void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
332 lea(rscratch, src);
333 if (src.is_lval()) {
334 push(rscratch);
335 } else {
336 pushq(Address(rscratch, 0));
337 }
338 }
339
340 static void pass_arg0(MacroAssembler* masm, Register arg) {
341 if (c_rarg0 != arg ) {
342 masm->mov(c_rarg0, arg);
343 }
344 }
345
346 static void pass_arg1(MacroAssembler* masm, Register arg) {
347 if (c_rarg1 != arg ) {
348 masm->mov(c_rarg1, arg);
349 }
350 }
351
352 static void pass_arg2(MacroAssembler* masm, Register arg) {
353 if (c_rarg2 != arg ) {
354 masm->mov(c_rarg2, arg);
355 }
356 }
357
358 static void pass_arg3(MacroAssembler* masm, Register arg) {
359 if (c_rarg3 != arg ) {
360 masm->mov(c_rarg3, arg);
361 }
362 }
363
364 void MacroAssembler::stop(const char* msg) {
365 if (ShowMessageBoxOnError) {
366 address rip = pc();
367 pusha(); // get regs on stack
368 lea(c_rarg1, InternalAddress(rip));
369 movq(c_rarg2, rsp); // pass pointer to regs array
370 }
371 // Skip AOT caching C strings in scratch buffer.
372 const char* str = (code_section()->scratch_emit()) ? msg : AOTCodeCache::add_C_string(msg);
373 lea(c_rarg0, ExternalAddress((address) str));
374 andq(rsp, -16); // align stack as required by ABI
375 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
376 hlt();
377 }
378
379 void MacroAssembler::warn(const char* msg) {
380 push(rbp);
381 movq(rbp, rsp);
382 andq(rsp, -16); // align stack as required by push_CPU_state and call
383 push_CPU_state(); // keeps alignment at 16 bytes
384
385 #ifdef _WIN64
386 // Windows always allocates space for its register args
387 subq(rsp, frame::arg_reg_save_area_bytes);
388 #endif
389 const char* str = (code_section()->scratch_emit()) ? msg : AOTCodeCache::add_C_string(msg);
390 lea(c_rarg0, ExternalAddress((address) str));
391 call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
392
393 #ifdef _WIN64
394 // restore stack pointer
395 addq(rsp, frame::arg_reg_save_area_bytes);
396 #endif
397 pop_CPU_state();
398 mov(rsp, rbp);
399 pop(rbp);
400 }
401
402 void MacroAssembler::print_state() {
403 address rip = pc();
404 pusha(); // get regs on stack
405 push(rbp);
406 movq(rbp, rsp);
407 andq(rsp, -16); // align stack as required by push_CPU_state and call
408 push_CPU_state(); // keeps alignment at 16 bytes
409
410 lea(c_rarg0, InternalAddress(rip));
411 lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
412 call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
413
414 pop_CPU_state();
415 mov(rsp, rbp);
416 pop(rbp);
417 popa();
418 }
419
420 #ifndef PRODUCT
421 extern "C" void findpc(intptr_t x);
422 #endif
423
424 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
425 // In order to get locks to work, we need to fake a in_VM state
426 if (ShowMessageBoxOnError) {
427 JavaThread* thread = JavaThread::current();
428 JavaThreadState saved_state = thread->thread_state();
429 thread->set_thread_state(_thread_in_vm);
430 #ifndef PRODUCT
431 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
432 ttyLocker ttyl;
433 BytecodeCounter::print();
434 }
435 #endif
436 // To see where a verify_oop failed, get $ebx+40/X for this frame.
437 // XXX correct this offset for amd64
438 // This is the value of eip which points to where verify_oop will return.
439 if (os::message_box(msg, "Execution stopped, print registers?")) {
440 print_state64(pc, regs);
441 BREAKPOINT;
442 }
443 }
444 fatal("DEBUG MESSAGE: %s", msg);
445 }
446
447 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
448 ttyLocker ttyl;
449 DebuggingContext debugging{};
450 tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
451 #ifndef PRODUCT
452 tty->cr();
453 findpc(pc);
454 tty->cr();
455 #endif
456 #define PRINT_REG(rax, value) \
457 { tty->print("%s = ", #rax); os::print_location(tty, value); }
458 PRINT_REG(rax, regs[15]);
459 PRINT_REG(rbx, regs[12]);
460 PRINT_REG(rcx, regs[14]);
461 PRINT_REG(rdx, regs[13]);
462 PRINT_REG(rdi, regs[8]);
463 PRINT_REG(rsi, regs[9]);
464 PRINT_REG(rbp, regs[10]);
465 // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
466 PRINT_REG(rsp, (intptr_t)(®s[16]));
467 PRINT_REG(r8 , regs[7]);
468 PRINT_REG(r9 , regs[6]);
469 PRINT_REG(r10, regs[5]);
470 PRINT_REG(r11, regs[4]);
471 PRINT_REG(r12, regs[3]);
472 PRINT_REG(r13, regs[2]);
473 PRINT_REG(r14, regs[1]);
474 PRINT_REG(r15, regs[0]);
475 #undef PRINT_REG
476 // Print some words near the top of the stack.
477 int64_t* rsp = ®s[16];
478 int64_t* dump_sp = rsp;
479 for (int col1 = 0; col1 < 8; col1++) {
480 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
481 os::print_location(tty, *dump_sp++);
482 }
483 for (int row = 0; row < 25; row++) {
484 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
485 for (int col = 0; col < 4; col++) {
486 tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
487 }
488 tty->cr();
489 }
490 // Print some instructions around pc:
491 Disassembler::decode((address)pc-64, (address)pc);
492 tty->print_cr("--------");
493 Disassembler::decode((address)pc, (address)pc+32);
494 }
495
496 // The java_calling_convention describes stack locations as ideal slots on
497 // a frame with no abi restrictions. Since we must observe abi restrictions
498 // (like the placement of the register window) the slots must be biased by
499 // the following value.
500 static int reg2offset_in(VMReg r) {
501 // Account for saved rbp and return address
502 // This should really be in_preserve_stack_slots
503 return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
504 }
505
506 static int reg2offset_out(VMReg r) {
507 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
508 }
509
510 // A long move
511 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
512
513 // The calling conventions assures us that each VMregpair is either
514 // all really one physical register or adjacent stack slots.
515
516 if (src.is_single_phys_reg() ) {
517 if (dst.is_single_phys_reg()) {
518 if (dst.first() != src.first()) {
519 mov(dst.first()->as_Register(), src.first()->as_Register());
520 }
521 } else {
522 assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
523 src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
524 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
525 }
526 } else if (dst.is_single_phys_reg()) {
527 assert(src.is_single_reg(), "not a stack pair");
528 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
529 } else {
530 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
531 movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
532 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
533 }
534 }
535
536 // A double move
537 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
538
539 // The calling conventions assures us that each VMregpair is either
540 // all really one physical register or adjacent stack slots.
541
542 if (src.is_single_phys_reg() ) {
543 if (dst.is_single_phys_reg()) {
544 // In theory these overlap but the ordering is such that this is likely a nop
545 if ( src.first() != dst.first()) {
546 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
547 }
548 } else {
549 assert(dst.is_single_reg(), "not a stack pair");
550 movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
551 }
552 } else if (dst.is_single_phys_reg()) {
553 assert(src.is_single_reg(), "not a stack pair");
554 movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
555 } else {
556 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
557 movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
558 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
559 }
560 }
561
562
563 // A float arg may have to do float reg int reg conversion
564 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
565 assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
566
567 // The calling conventions assures us that each VMregpair is either
568 // all really one physical register or adjacent stack slots.
569
570 if (src.first()->is_stack()) {
571 if (dst.first()->is_stack()) {
572 movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
573 movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
574 } else {
575 // stack to reg
576 assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
577 movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
578 }
579 } else if (dst.first()->is_stack()) {
580 // reg to stack
581 assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
582 movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
583 } else {
584 // reg to reg
585 // In theory these overlap but the ordering is such that this is likely a nop
586 if ( src.first() != dst.first()) {
587 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
588 }
589 }
590 }
591
592 // On 64 bit we will store integer like items to the stack as
593 // 64 bits items (x86_32/64 abi) even though java would only store
594 // 32bits for a parameter. On 32bit it will simply be 32 bits
595 // So this routine will do 32->32 on 32bit and 32->64 on 64bit
596 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
597 if (src.first()->is_stack()) {
598 if (dst.first()->is_stack()) {
599 // stack to stack
600 movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
601 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
602 } else {
603 // stack to reg
604 movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
605 }
606 } else if (dst.first()->is_stack()) {
607 // reg to stack
608 // Do we really have to sign extend???
609 // __ movslq(src.first()->as_Register(), src.first()->as_Register());
610 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
611 } else {
612 // Do we really have to sign extend???
613 // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
614 if (dst.first() != src.first()) {
615 movq(dst.first()->as_Register(), src.first()->as_Register());
616 }
617 }
618 }
619
620 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
621 if (src.first()->is_stack()) {
622 if (dst.first()->is_stack()) {
623 // stack to stack
624 movq(rax, Address(rbp, reg2offset_in(src.first())));
625 movq(Address(rsp, reg2offset_out(dst.first())), rax);
626 } else {
627 // stack to reg
628 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
629 }
630 } else if (dst.first()->is_stack()) {
631 // reg to stack
632 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
633 } else {
634 if (dst.first() != src.first()) {
635 movq(dst.first()->as_Register(), src.first()->as_Register());
636 }
637 }
638 }
639
640 // An oop arg. Must pass a handle not the oop itself
641 void MacroAssembler::object_move(OopMap* map,
642 int oop_handle_offset,
643 int framesize_in_slots,
644 VMRegPair src,
645 VMRegPair dst,
646 bool is_receiver,
647 int* receiver_offset) {
648
649 // must pass a handle. First figure out the location we use as a handle
650
651 Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
652
653 // See if oop is null if it is we need no handle
654
655 if (src.first()->is_stack()) {
656
657 // Oop is already on the stack as an argument
658 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
659 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
660 if (is_receiver) {
661 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
662 }
663
664 cmpptr(Address(rbp, reg2offset_in(src.first())), NULL_WORD);
665 lea(rHandle, Address(rbp, reg2offset_in(src.first())));
666 // conditionally move a null
667 cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
668 } else {
669
670 // Oop is in a register we must store it to the space we reserve
671 // on the stack for oop_handles and pass a handle if oop is non-null
672
673 const Register rOop = src.first()->as_Register();
674 int oop_slot;
675 if (rOop == j_rarg0)
676 oop_slot = 0;
677 else if (rOop == j_rarg1)
678 oop_slot = 1;
679 else if (rOop == j_rarg2)
680 oop_slot = 2;
681 else if (rOop == j_rarg3)
682 oop_slot = 3;
683 else if (rOop == j_rarg4)
684 oop_slot = 4;
685 else {
686 assert(rOop == j_rarg5, "wrong register");
687 oop_slot = 5;
688 }
689
690 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
691 int offset = oop_slot*VMRegImpl::stack_slot_size;
692
693 map->set_oop(VMRegImpl::stack2reg(oop_slot));
694 // Store oop in handle area, may be null
695 movptr(Address(rsp, offset), rOop);
696 if (is_receiver) {
697 *receiver_offset = offset;
698 }
699
700 cmpptr(rOop, NULL_WORD);
701 lea(rHandle, Address(rsp, offset));
702 // conditionally move a null from the handle area where it was just stored
703 cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
704 }
705
706 // If arg is on the stack then place it otherwise it is already in correct reg.
707 if (dst.first()->is_stack()) {
708 movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
709 }
710 }
711
712 void MacroAssembler::addptr(Register dst, int32_t imm32) {
713 addq(dst, imm32);
714 }
715
716 void MacroAssembler::addptr(Register dst, Register src) {
717 addq(dst, src);
718 }
719
720 void MacroAssembler::addptr(Address dst, Register src) {
721 addq(dst, src);
722 }
723
724 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
725 assert(rscratch != noreg || always_reachable(src), "missing");
726
727 if (reachable(src)) {
728 Assembler::addsd(dst, as_Address(src));
729 } else {
730 lea(rscratch, src);
731 Assembler::addsd(dst, Address(rscratch, 0));
732 }
733 }
734
735 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src, Register rscratch) {
736 assert(rscratch != noreg || always_reachable(src), "missing");
737
738 if (reachable(src)) {
739 addss(dst, as_Address(src));
740 } else {
741 lea(rscratch, src);
742 addss(dst, Address(rscratch, 0));
743 }
744 }
745
746 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
747 assert(rscratch != noreg || always_reachable(src), "missing");
748
749 if (reachable(src)) {
750 Assembler::addpd(dst, as_Address(src));
751 } else {
752 lea(rscratch, src);
753 Assembler::addpd(dst, Address(rscratch, 0));
754 }
755 }
756
757 // See 8273459. Function for ensuring 64-byte alignment, intended for stubs only.
758 // Stub code is generated once and never copied.
759 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
760 void MacroAssembler::align64() {
761 align(64, (uint)(uintptr_t)pc());
762 }
763
764 void MacroAssembler::align32() {
765 align(32, (uint)(uintptr_t)pc());
766 }
767
768 void MacroAssembler::align(uint modulus) {
769 // 8273459: Ensure alignment is possible with current segment alignment
770 assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
771 align(modulus, offset());
772 }
773
774 void MacroAssembler::align(uint modulus, uint target) {
775 if (target % modulus != 0) {
776 nop(modulus - (target % modulus));
777 }
778 }
779
780 void MacroAssembler::push_f(XMMRegister r) {
781 subptr(rsp, wordSize);
782 movflt(Address(rsp, 0), r);
783 }
784
785 void MacroAssembler::pop_f(XMMRegister r) {
786 movflt(r, Address(rsp, 0));
787 addptr(rsp, wordSize);
788 }
789
790 void MacroAssembler::push_d(XMMRegister r) {
791 subptr(rsp, 2 * wordSize);
792 movdbl(Address(rsp, 0), r);
793 }
794
795 void MacroAssembler::pop_d(XMMRegister r) {
796 movdbl(r, Address(rsp, 0));
797 addptr(rsp, 2 * Interpreter::stackElementSize);
798 }
799
800 void MacroAssembler::push_ppx(Register src) {
801 if (VM_Version::supports_apx_f()) {
802 pushp(src);
803 } else {
804 Assembler::push(src);
805 }
806 }
807
808 void MacroAssembler::pop_ppx(Register dst) {
809 if (VM_Version::supports_apx_f()) {
810 popp(dst);
811 } else {
812 Assembler::pop(dst);
813 }
814 }
815
816 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
817 // Used in sign-masking with aligned address.
818 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
819 assert(rscratch != noreg || always_reachable(src), "missing");
820
821 if (UseAVX > 2 &&
822 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
823 (dst->encoding() >= 16)) {
824 vpand(dst, dst, src, AVX_512bit, rscratch);
825 } else if (reachable(src)) {
826 Assembler::andpd(dst, as_Address(src));
827 } else {
828 lea(rscratch, src);
829 Assembler::andpd(dst, Address(rscratch, 0));
830 }
831 }
832
833 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register rscratch) {
834 // Used in sign-masking with aligned address.
835 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
836 assert(rscratch != noreg || always_reachable(src), "missing");
837
838 if (reachable(src)) {
839 Assembler::andps(dst, as_Address(src));
840 } else {
841 lea(rscratch, src);
842 Assembler::andps(dst, Address(rscratch, 0));
843 }
844 }
845
846 void MacroAssembler::andptr(Register dst, int32_t imm32) {
847 andq(dst, imm32);
848 }
849
850 void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) {
851 assert(rscratch != noreg || always_reachable(src), "missing");
852
853 if (reachable(src)) {
854 andq(dst, as_Address(src));
855 } else {
856 lea(rscratch, src);
857 andq(dst, Address(rscratch, 0));
858 }
859 }
860
861 void MacroAssembler::atomic_incl(Address counter_addr) {
862 lock();
863 incrementl(counter_addr);
864 }
865
866 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register rscratch) {
867 assert(rscratch != noreg || always_reachable(counter_addr), "missing");
868
869 if (reachable(counter_addr)) {
870 atomic_incl(as_Address(counter_addr));
871 } else {
872 lea(rscratch, counter_addr);
873 atomic_incl(Address(rscratch, 0));
874 }
875 }
876
877 void MacroAssembler::atomic_incq(Address counter_addr) {
878 lock();
879 incrementq(counter_addr);
880 }
881
882 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register rscratch) {
883 assert(rscratch != noreg || always_reachable(counter_addr), "missing");
884
885 if (reachable(counter_addr)) {
886 atomic_incq(as_Address(counter_addr));
887 } else {
888 lea(rscratch, counter_addr);
889 atomic_incq(Address(rscratch, 0));
890 }
891 }
892
893 // Writes to stack successive pages until offset reached to check for
894 // stack overflow + shadow pages. This clobbers tmp.
895 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
896 movptr(tmp, rsp);
897 // Bang stack for total size given plus shadow page size.
898 // Bang one page at a time because large size can bang beyond yellow and
899 // red zones.
900 Label loop;
901 bind(loop);
902 movl(Address(tmp, (-(int)os::vm_page_size())), size );
903 subptr(tmp, (int)os::vm_page_size());
904 subl(size, (int)os::vm_page_size());
905 jcc(Assembler::greater, loop);
906
907 // Bang down shadow pages too.
908 // At this point, (tmp-0) is the last address touched, so don't
909 // touch it again. (It was touched as (tmp-pagesize) but then tmp
910 // was post-decremented.) Skip this address by starting at i=1, and
911 // touch a few more pages below. N.B. It is important to touch all
912 // the way down including all pages in the shadow zone.
913 for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()); i++) {
914 // this could be any sized move but this is can be a debugging crumb
915 // so the bigger the better.
916 movptr(Address(tmp, (-i*(int)os::vm_page_size())), size );
917 }
918 }
919
920 void MacroAssembler::reserved_stack_check() {
921 // testing if reserved zone needs to be enabled
922 Label no_reserved_zone_enabling;
923
924 cmpptr(rsp, Address(r15_thread, JavaThread::reserved_stack_activation_offset()));
925 jcc(Assembler::below, no_reserved_zone_enabling);
926
927 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), r15_thread);
928 jump(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
929 should_not_reach_here();
930
931 bind(no_reserved_zone_enabling);
932 }
933
934 void MacroAssembler::c2bool(Register x) {
935 // implements x == 0 ? 0 : 1
936 // note: must only look at least-significant byte of x
937 // since C-style booleans are stored in one byte
938 // only! (was bug)
939 andl(x, 0xFF);
940 setb(Assembler::notZero, x);
941 }
942
943 // Wouldn't need if AddressLiteral version had new name
944 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
945 Assembler::call(L, rtype);
946 }
947
948 void MacroAssembler::call(Register entry) {
949 Assembler::call(entry);
950 }
951
952 void MacroAssembler::call(AddressLiteral entry, Register rscratch) {
953 assert(rscratch != noreg || always_reachable(entry), "missing");
954
955 if (reachable(entry)) {
956 Assembler::call_literal(entry.target(), entry.rspec());
957 } else {
958 lea(rscratch, entry);
959 Assembler::call(rscratch);
960 }
961 }
962
963 void MacroAssembler::ic_call(address entry, jint method_index) {
964 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
965 // Needs full 64-bit immediate for later patching.
966 Assembler::mov64(rax, (int64_t)Universe::non_oop_word());
967 call(AddressLiteral(entry, rh));
968 }
969
970 int MacroAssembler::ic_check_size() {
971 return UseCompactObjectHeaders ? 17 : 14;
972 }
973
974 int MacroAssembler::ic_check(int end_alignment) {
975 Register receiver = j_rarg0;
976 Register data = rax;
977 Register temp = rscratch1;
978
979 // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
980 // before the inline cache check, so we don't have to execute any nop instructions when dispatching
981 // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
982 // before the inline cache check here, and not after
983 align(end_alignment, offset() + ic_check_size());
984
985 int uep_offset = offset();
986
987 if (UseCompactObjectHeaders) {
988 load_narrow_klass_compact(temp, receiver);
989 cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
990 } else {
991 movl(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
992 cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
993 }
994
995 // if inline cache check fails, then jump to runtime routine
996 jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
997 assert((offset() % end_alignment) == 0, "Misaligned verified entry point (%d, %d, %d)", uep_offset, offset(), end_alignment);
998
999 return uep_offset;
1000 }
1001
1002 void MacroAssembler::emit_static_call_stub() {
1003 // Static stub relocation also tags the Method* in the code-stream.
1004 mov_metadata(rbx, (Metadata*) nullptr); // Method is zapped till fixup time.
1005 // This is recognized as unresolved by relocs/nativeinst/ic code.
1006 jump(RuntimeAddress(pc()));
1007 }
1008
1009 // Implementation of call_VM versions
1010
1011 void MacroAssembler::call_VM(Register oop_result,
1012 address entry_point,
1013 bool check_exceptions) {
1014 Label C, E;
1015 call(C, relocInfo::none);
1016 jmp(E);
1017
1018 bind(C);
1019 call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1020 ret(0);
1021
1022 bind(E);
1023 }
1024
1025 void MacroAssembler::call_VM(Register oop_result,
1026 address entry_point,
1027 Register arg_1,
1028 bool check_exceptions) {
1029 Label C, E;
1030 call(C, relocInfo::none);
1031 jmp(E);
1032
1033 bind(C);
1034 pass_arg1(this, arg_1);
1035 call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1036 ret(0);
1037
1038 bind(E);
1039 }
1040
1041 void MacroAssembler::call_VM(Register oop_result,
1042 address entry_point,
1043 Register arg_1,
1044 Register arg_2,
1045 bool check_exceptions) {
1046 Label C, E;
1047 call(C, relocInfo::none);
1048 jmp(E);
1049
1050 bind(C);
1051
1052 assert_different_registers(arg_1, c_rarg2);
1053
1054 pass_arg2(this, arg_2);
1055 pass_arg1(this, arg_1);
1056 call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1057 ret(0);
1058
1059 bind(E);
1060 }
1061
1062 void MacroAssembler::call_VM(Register oop_result,
1063 address entry_point,
1064 Register arg_1,
1065 Register arg_2,
1066 Register arg_3,
1067 bool check_exceptions) {
1068 Label C, E;
1069 call(C, relocInfo::none);
1070 jmp(E);
1071
1072 bind(C);
1073
1074 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1075 assert_different_registers(arg_2, c_rarg3);
1076 pass_arg3(this, arg_3);
1077 pass_arg2(this, arg_2);
1078 pass_arg1(this, arg_1);
1079 call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1080 ret(0);
1081
1082 bind(E);
1083 }
1084
1085 void MacroAssembler::call_VM(Register oop_result,
1086 Register last_java_sp,
1087 address entry_point,
1088 int number_of_arguments,
1089 bool check_exceptions) {
1090 call_VM_base(oop_result, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1091 }
1092
1093 void MacroAssembler::call_VM(Register oop_result,
1094 Register last_java_sp,
1095 address entry_point,
1096 Register arg_1,
1097 bool check_exceptions) {
1098 pass_arg1(this, arg_1);
1099 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1100 }
1101
1102 void MacroAssembler::call_VM(Register oop_result,
1103 Register last_java_sp,
1104 address entry_point,
1105 Register arg_1,
1106 Register arg_2,
1107 bool check_exceptions) {
1108
1109 assert_different_registers(arg_1, c_rarg2);
1110 pass_arg2(this, arg_2);
1111 pass_arg1(this, arg_1);
1112 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1113 }
1114
1115 void MacroAssembler::call_VM(Register oop_result,
1116 Register last_java_sp,
1117 address entry_point,
1118 Register arg_1,
1119 Register arg_2,
1120 Register arg_3,
1121 bool check_exceptions) {
1122 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1123 assert_different_registers(arg_2, c_rarg3);
1124 pass_arg3(this, arg_3);
1125 pass_arg2(this, arg_2);
1126 pass_arg1(this, arg_1);
1127 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1128 }
1129
1130 void MacroAssembler::super_call_VM(Register oop_result,
1131 Register last_java_sp,
1132 address entry_point,
1133 int number_of_arguments,
1134 bool check_exceptions) {
1135 MacroAssembler::call_VM_base(oop_result, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1136 }
1137
1138 void MacroAssembler::super_call_VM(Register oop_result,
1139 Register last_java_sp,
1140 address entry_point,
1141 Register arg_1,
1142 bool check_exceptions) {
1143 pass_arg1(this, arg_1);
1144 super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1145 }
1146
1147 void MacroAssembler::super_call_VM(Register oop_result,
1148 Register last_java_sp,
1149 address entry_point,
1150 Register arg_1,
1151 Register arg_2,
1152 bool check_exceptions) {
1153
1154 assert_different_registers(arg_1, c_rarg2);
1155 pass_arg2(this, arg_2);
1156 pass_arg1(this, arg_1);
1157 super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1158 }
1159
1160 void MacroAssembler::super_call_VM(Register oop_result,
1161 Register last_java_sp,
1162 address entry_point,
1163 Register arg_1,
1164 Register arg_2,
1165 Register arg_3,
1166 bool check_exceptions) {
1167 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1168 assert_different_registers(arg_2, c_rarg3);
1169 pass_arg3(this, arg_3);
1170 pass_arg2(this, arg_2);
1171 pass_arg1(this, arg_1);
1172 super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1173 }
1174
1175 void MacroAssembler::call_VM_base(Register oop_result,
1176 Register last_java_sp,
1177 address entry_point,
1178 int number_of_arguments,
1179 bool check_exceptions) {
1180 Register java_thread = r15_thread;
1181
1182 // determine last_java_sp register
1183 if (!last_java_sp->is_valid()) {
1184 last_java_sp = rsp;
1185 }
1186 // debugging support
1187 assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
1188 #ifdef ASSERT
1189 // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1190 // r12 is the heapbase.
1191 if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
1192 #endif // ASSERT
1193
1194 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
1195 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1196
1197 // push java thread (becomes first argument of C function)
1198
1199 mov(c_rarg0, r15_thread);
1200
1201 // set last Java frame before call
1202 assert(last_java_sp != rbp, "can't use ebp/rbp");
1203
1204 // Only interpreter should have to set fp
1205 set_last_Java_frame(last_java_sp, rbp, nullptr, rscratch1);
1206
1207 // do the call, remove parameters
1208 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1209
1210 #ifdef ASSERT
1211 // Check that thread register is not clobbered.
1212 guarantee(java_thread != rax, "change this code");
1213 push(rax);
1214 { Label L;
1215 get_thread_slow(rax);
1216 cmpptr(java_thread, rax);
1217 jcc(Assembler::equal, L);
1218 STOP("MacroAssembler::call_VM_base: java_thread not callee saved?");
1219 bind(L);
1220 }
1221 pop(rax);
1222 #endif
1223
1224 // reset last Java frame
1225 // Only interpreter should have to clear fp
1226 reset_last_Java_frame(true);
1227
1228 // C++ interp handles this in the interpreter
1229 check_and_handle_popframe();
1230 check_and_handle_earlyret();
1231
1232 if (check_exceptions) {
1233 // check for pending exceptions (java_thread is set upon return)
1234 cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1235 // This used to conditionally jump to forward_exception however it is
1236 // possible if we relocate that the branch will not reach. So we must jump
1237 // around so we can always reach
1238
1239 Label ok;
1240 jcc(Assembler::equal, ok);
1241 jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1242 bind(ok);
1243 }
1244
1245 // get oop result if there is one and reset the value in the thread
1246 if (oop_result->is_valid()) {
1247 get_vm_result_oop(oop_result);
1248 }
1249 }
1250
1251 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1252 // Calculate the value for last_Java_sp somewhat subtle.
1253 // call_VM does an intermediate call which places a return address on
1254 // the stack just under the stack pointer as the user finished with it.
1255 // This allows use to retrieve last_Java_pc from last_Java_sp[-1].
1256
1257 // We've pushed one address, correct last_Java_sp
1258 lea(rax, Address(rsp, wordSize));
1259
1260 call_VM_base(oop_result, rax, entry_point, number_of_arguments, check_exceptions);
1261 }
1262
1263 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1264 void MacroAssembler::call_VM_leaf0(address entry_point) {
1265 MacroAssembler::call_VM_leaf_base(entry_point, 0);
1266 }
1267
1268 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1269 call_VM_leaf_base(entry_point, number_of_arguments);
1270 }
1271
1272 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1273 pass_arg0(this, arg_0);
1274 call_VM_leaf(entry_point, 1);
1275 }
1276
1277 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1278
1279 assert_different_registers(arg_0, c_rarg1);
1280 pass_arg1(this, arg_1);
1281 pass_arg0(this, arg_0);
1282 call_VM_leaf(entry_point, 2);
1283 }
1284
1285 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1286 assert_different_registers(arg_0, c_rarg1, c_rarg2);
1287 assert_different_registers(arg_1, c_rarg2);
1288 pass_arg2(this, arg_2);
1289 pass_arg1(this, arg_1);
1290 pass_arg0(this, arg_0);
1291 call_VM_leaf(entry_point, 3);
1292 }
1293
1294 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1295 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
1296 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1297 assert_different_registers(arg_2, c_rarg3);
1298 pass_arg3(this, arg_3);
1299 pass_arg2(this, arg_2);
1300 pass_arg1(this, arg_1);
1301 pass_arg0(this, arg_0);
1302 call_VM_leaf(entry_point, 3);
1303 }
1304
1305 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1306 pass_arg0(this, arg_0);
1307 MacroAssembler::call_VM_leaf_base(entry_point, 1);
1308 }
1309
1310 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1311 assert_different_registers(arg_0, c_rarg1);
1312 pass_arg1(this, arg_1);
1313 pass_arg0(this, arg_0);
1314 MacroAssembler::call_VM_leaf_base(entry_point, 2);
1315 }
1316
1317 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1318 assert_different_registers(arg_0, c_rarg1, c_rarg2);
1319 assert_different_registers(arg_1, c_rarg2);
1320 pass_arg2(this, arg_2);
1321 pass_arg1(this, arg_1);
1322 pass_arg0(this, arg_0);
1323 MacroAssembler::call_VM_leaf_base(entry_point, 3);
1324 }
1325
1326 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1327 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
1328 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1329 assert_different_registers(arg_2, c_rarg3);
1330 pass_arg3(this, arg_3);
1331 pass_arg2(this, arg_2);
1332 pass_arg1(this, arg_1);
1333 pass_arg0(this, arg_0);
1334 MacroAssembler::call_VM_leaf_base(entry_point, 4);
1335 }
1336
1337 void MacroAssembler::get_vm_result_oop(Register oop_result) {
1338 movptr(oop_result, Address(r15_thread, JavaThread::vm_result_oop_offset()));
1339 movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
1340 verify_oop_msg(oop_result, "broken oop in call_VM_base");
1341 }
1342
1343 void MacroAssembler::get_vm_result_metadata(Register metadata_result) {
1344 movptr(metadata_result, Address(r15_thread, JavaThread::vm_result_metadata_offset()));
1345 movptr(Address(r15_thread, JavaThread::vm_result_metadata_offset()), NULL_WORD);
1346 }
1347
1348 void MacroAssembler::check_and_handle_earlyret() {
1349 }
1350
1351 void MacroAssembler::check_and_handle_popframe() {
1352 }
1353
1354 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm, Register rscratch) {
1355 assert(rscratch != noreg || always_reachable(src1), "missing");
1356
1357 if (reachable(src1)) {
1358 cmpl(as_Address(src1), imm);
1359 } else {
1360 lea(rscratch, src1);
1361 cmpl(Address(rscratch, 0), imm);
1362 }
1363 }
1364
1365 void MacroAssembler::cmp32(Register src1, AddressLiteral src2, Register rscratch) {
1366 assert(!src2.is_lval(), "use cmpptr");
1367 assert(rscratch != noreg || always_reachable(src2), "missing");
1368
1369 if (reachable(src2)) {
1370 cmpl(src1, as_Address(src2));
1371 } else {
1372 lea(rscratch, src2);
1373 cmpl(src1, Address(rscratch, 0));
1374 }
1375 }
1376
1377 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1378 Assembler::cmpl(src1, imm);
1379 }
1380
1381 void MacroAssembler::cmp32(Register src1, Address src2) {
1382 Assembler::cmpl(src1, src2);
1383 }
1384
1385 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1386 ucomisd(opr1, opr2);
1387
1388 Label L;
1389 if (unordered_is_less) {
1390 movl(dst, -1);
1391 jcc(Assembler::parity, L);
1392 jcc(Assembler::below , L);
1393 movl(dst, 0);
1394 jcc(Assembler::equal , L);
1395 increment(dst);
1396 } else { // unordered is greater
1397 movl(dst, 1);
1398 jcc(Assembler::parity, L);
1399 jcc(Assembler::above , L);
1400 movl(dst, 0);
1401 jcc(Assembler::equal , L);
1402 decrementl(dst);
1403 }
1404 bind(L);
1405 }
1406
1407 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1408 ucomiss(opr1, opr2);
1409
1410 Label L;
1411 if (unordered_is_less) {
1412 movl(dst, -1);
1413 jcc(Assembler::parity, L);
1414 jcc(Assembler::below , L);
1415 movl(dst, 0);
1416 jcc(Assembler::equal , L);
1417 increment(dst);
1418 } else { // unordered is greater
1419 movl(dst, 1);
1420 jcc(Assembler::parity, L);
1421 jcc(Assembler::above , L);
1422 movl(dst, 0);
1423 jcc(Assembler::equal , L);
1424 decrementl(dst);
1425 }
1426 bind(L);
1427 }
1428
1429
1430 void MacroAssembler::cmp8(AddressLiteral src1, int imm, Register rscratch) {
1431 assert(rscratch != noreg || always_reachable(src1), "missing");
1432
1433 if (reachable(src1)) {
1434 cmpb(as_Address(src1), imm);
1435 } else {
1436 lea(rscratch, src1);
1437 cmpb(Address(rscratch, 0), imm);
1438 }
1439 }
1440
1441 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2, Register rscratch) {
1442 assert(rscratch != noreg || always_reachable(src2), "missing");
1443
1444 if (src2.is_lval()) {
1445 movptr(rscratch, src2);
1446 Assembler::cmpq(src1, rscratch);
1447 } else if (reachable(src2)) {
1448 cmpq(src1, as_Address(src2));
1449 } else {
1450 lea(rscratch, src2);
1451 Assembler::cmpq(src1, Address(rscratch, 0));
1452 }
1453 }
1454
1455 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2, Register rscratch) {
1456 assert(src2.is_lval(), "not a mem-mem compare");
1457 // moves src2's literal address
1458 movptr(rscratch, src2);
1459 Assembler::cmpq(src1, rscratch);
1460 }
1461
1462 void MacroAssembler::cmpoop(Register src1, Register src2) {
1463 cmpptr(src1, src2);
1464 }
1465
1466 void MacroAssembler::cmpoop(Register src1, Address src2) {
1467 cmpptr(src1, src2);
1468 }
1469
1470 void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) {
1471 movoop(rscratch, src2);
1472 cmpptr(src1, rscratch);
1473 }
1474
1475 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) {
1476 assert(rscratch != noreg || always_reachable(adr), "missing");
1477
1478 if (reachable(adr)) {
1479 lock();
1480 cmpxchgptr(reg, as_Address(adr));
1481 } else {
1482 lea(rscratch, adr);
1483 lock();
1484 cmpxchgptr(reg, Address(rscratch, 0));
1485 }
1486 }
1487
1488 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
1489 cmpxchgq(reg, adr);
1490 }
1491
1492 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1493 assert(rscratch != noreg || always_reachable(src), "missing");
1494
1495 if (reachable(src)) {
1496 Assembler::comisd(dst, as_Address(src));
1497 } else {
1498 lea(rscratch, src);
1499 Assembler::comisd(dst, Address(rscratch, 0));
1500 }
1501 }
1502
1503 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1504 assert(rscratch != noreg || always_reachable(src), "missing");
1505
1506 if (reachable(src)) {
1507 Assembler::comiss(dst, as_Address(src));
1508 } else {
1509 lea(rscratch, src);
1510 Assembler::comiss(dst, Address(rscratch, 0));
1511 }
1512 }
1513
1514
1515 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch) {
1516 assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1517
1518 Condition negated_cond = negate_condition(cond);
1519 Label L;
1520 jcc(negated_cond, L);
1521 pushf(); // Preserve flags
1522 atomic_incl(counter_addr, rscratch);
1523 popf();
1524 bind(L);
1525 }
1526
1527 int MacroAssembler::corrected_idivl(Register reg) {
1528 // Full implementation of Java idiv and irem; checks for
1529 // special case as described in JVM spec., p.243 & p.271.
1530 // The function returns the (pc) offset of the idivl
1531 // instruction - may be needed for implicit exceptions.
1532 //
1533 // normal case special case
1534 //
1535 // input : rax,: dividend min_int
1536 // reg: divisor (may not be rax,/rdx) -1
1537 //
1538 // output: rax,: quotient (= rax, idiv reg) min_int
1539 // rdx: remainder (= rax, irem reg) 0
1540 assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
1541 const int min_int = 0x80000000;
1542 Label normal_case, special_case;
1543
1544 // check for special case
1545 cmpl(rax, min_int);
1546 jcc(Assembler::notEqual, normal_case);
1547 xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
1548 cmpl(reg, -1);
1549 jcc(Assembler::equal, special_case);
1550
1551 // handle normal case
1552 bind(normal_case);
1553 cdql();
1554 int idivl_offset = offset();
1555 idivl(reg);
1556
1557 // normal and special case exit
1558 bind(special_case);
1559
1560 return idivl_offset;
1561 }
1562
1563
1564
1565 void MacroAssembler::decrementl(Register reg, int value) {
1566 if (value == min_jint) {subl(reg, value) ; return; }
1567 if (value < 0) { incrementl(reg, -value); return; }
1568 if (value == 0) { ; return; }
1569 if (value == 1 && UseIncDec) { decl(reg) ; return; }
1570 /* else */ { subl(reg, value) ; return; }
1571 }
1572
1573 void MacroAssembler::decrementl(Address dst, int value) {
1574 if (value == min_jint) {subl(dst, value) ; return; }
1575 if (value < 0) { incrementl(dst, -value); return; }
1576 if (value == 0) { ; return; }
1577 if (value == 1 && UseIncDec) { decl(dst) ; return; }
1578 /* else */ { subl(dst, value) ; return; }
1579 }
1580
1581 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
1582 assert(shift_value > 0, "illegal shift value");
1583 Label _is_positive;
1584 testl (reg, reg);
1585 jcc (Assembler::positive, _is_positive);
1586 int offset = (1 << shift_value) - 1 ;
1587
1588 if (offset == 1) {
1589 incrementl(reg);
1590 } else {
1591 addl(reg, offset);
1592 }
1593
1594 bind (_is_positive);
1595 sarl(reg, shift_value);
1596 }
1597
1598 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1599 assert(rscratch != noreg || always_reachable(src), "missing");
1600
1601 if (reachable(src)) {
1602 Assembler::divsd(dst, as_Address(src));
1603 } else {
1604 lea(rscratch, src);
1605 Assembler::divsd(dst, Address(rscratch, 0));
1606 }
1607 }
1608
1609 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1610 assert(rscratch != noreg || always_reachable(src), "missing");
1611
1612 if (reachable(src)) {
1613 Assembler::divss(dst, as_Address(src));
1614 } else {
1615 lea(rscratch, src);
1616 Assembler::divss(dst, Address(rscratch, 0));
1617 }
1618 }
1619
1620 void MacroAssembler::enter() {
1621 push(rbp);
1622 mov(rbp, rsp);
1623 }
1624
1625 void MacroAssembler::post_call_nop() {
1626 if (!Continuations::enabled()) {
1627 return;
1628 }
1629 InstructionMark im(this);
1630 relocate(post_call_nop_Relocation::spec());
1631 InlineSkippedInstructionsCounter skipCounter(this);
1632 emit_int8((uint8_t)0x0f);
1633 emit_int8((uint8_t)0x1f);
1634 emit_int8((uint8_t)0x84);
1635 emit_int8((uint8_t)0x00);
1636 emit_int32(0x00);
1637 }
1638
1639 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1640 assert(rscratch != noreg || always_reachable(src), "missing");
1641 if (reachable(src)) {
1642 Assembler::mulpd(dst, as_Address(src));
1643 } else {
1644 lea(rscratch, src);
1645 Assembler::mulpd(dst, Address(rscratch, 0));
1646 }
1647 }
1648
1649 // dst = c = a * b + c
1650 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
1651 Assembler::vfmadd231sd(c, a, b);
1652 if (dst != c) {
1653 movdbl(dst, c);
1654 }
1655 }
1656
1657 // dst = c = a * b + c
1658 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
1659 Assembler::vfmadd231ss(c, a, b);
1660 if (dst != c) {
1661 movflt(dst, c);
1662 }
1663 }
1664
1665 // dst = c = a * b + c
1666 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
1667 Assembler::vfmadd231pd(c, a, b, vector_len);
1668 if (dst != c) {
1669 vmovdqu(dst, c);
1670 }
1671 }
1672
1673 // dst = c = a * b + c
1674 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
1675 Assembler::vfmadd231ps(c, a, b, vector_len);
1676 if (dst != c) {
1677 vmovdqu(dst, c);
1678 }
1679 }
1680
1681 // dst = c = a * b + c
1682 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
1683 Assembler::vfmadd231pd(c, a, b, vector_len);
1684 if (dst != c) {
1685 vmovdqu(dst, c);
1686 }
1687 }
1688
1689 // dst = c = a * b + c
1690 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
1691 Assembler::vfmadd231ps(c, a, b, vector_len);
1692 if (dst != c) {
1693 vmovdqu(dst, c);
1694 }
1695 }
1696
1697 void MacroAssembler::incrementl(AddressLiteral dst, Register rscratch) {
1698 assert(rscratch != noreg || always_reachable(dst), "missing");
1699
1700 if (reachable(dst)) {
1701 incrementl(as_Address(dst));
1702 } else {
1703 lea(rscratch, dst);
1704 incrementl(Address(rscratch, 0));
1705 }
1706 }
1707
1708 void MacroAssembler::incrementl(ArrayAddress dst, Register rscratch) {
1709 incrementl(as_Address(dst, rscratch));
1710 }
1711
1712 void MacroAssembler::incrementl(Register reg, int value) {
1713 if (value == min_jint) {addl(reg, value) ; return; }
1714 if (value < 0) { decrementl(reg, -value); return; }
1715 if (value == 0) { ; return; }
1716 if (value == 1 && UseIncDec) { incl(reg) ; return; }
1717 /* else */ { addl(reg, value) ; return; }
1718 }
1719
1720 void MacroAssembler::incrementl(Address dst, int value) {
1721 if (value == min_jint) {addl(dst, value) ; return; }
1722 if (value < 0) { decrementl(dst, -value); return; }
1723 if (value == 0) { ; return; }
1724 if (value == 1 && UseIncDec) { incl(dst) ; return; }
1725 /* else */ { addl(dst, value) ; return; }
1726 }
1727
1728 void MacroAssembler::jump(AddressLiteral dst, Register rscratch) {
1729 assert(rscratch != noreg || always_reachable(dst), "missing");
1730 assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump");
1731 if (reachable(dst)) {
1732 jmp_literal(dst.target(), dst.rspec());
1733 } else {
1734 lea(rscratch, dst);
1735 jmp(rscratch);
1736 }
1737 }
1738
1739 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst, Register rscratch) {
1740 assert(rscratch != noreg || always_reachable(dst), "missing");
1741 assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump_cc");
1742 if (reachable(dst)) {
1743 InstructionMark im(this);
1744 relocate(dst.reloc());
1745 const int short_size = 2;
1746 const int long_size = 6;
1747 int offs = (intptr_t)dst.target() - ((intptr_t)pc());
1748 if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
1749 // 0111 tttn #8-bit disp
1750 emit_int8(0x70 | cc);
1751 emit_int8((offs - short_size) & 0xFF);
1752 } else {
1753 // 0000 1111 1000 tttn #32-bit disp
1754 emit_int8(0x0F);
1755 emit_int8((unsigned char)(0x80 | cc));
1756 emit_int32(offs - long_size);
1757 }
1758 } else {
1759 #ifdef ASSERT
1760 warning("reversing conditional branch");
1761 #endif /* ASSERT */
1762 Label skip;
1763 jccb(reverse[cc], skip);
1764 lea(rscratch, dst);
1765 Assembler::jmp(rscratch);
1766 bind(skip);
1767 }
1768 }
1769
1770 void MacroAssembler::cmp32_mxcsr_std(Address mxcsr_save, Register tmp, Register rscratch) {
1771 ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
1772 assert(rscratch != noreg || always_reachable(mxcsr_std), "missing");
1773
1774 stmxcsr(mxcsr_save);
1775 movl(tmp, mxcsr_save);
1776 if (EnableX86ECoreOpts) {
1777 // The mxcsr_std has status bits set for performance on ECore
1778 orl(tmp, 0x003f);
1779 } else {
1780 // Mask out status bits (only check control and mask bits)
1781 andl(tmp, 0xFFC0);
1782 }
1783 cmp32(tmp, mxcsr_std, rscratch);
1784 }
1785
1786 void MacroAssembler::ldmxcsr(AddressLiteral src, Register rscratch) {
1787 assert(rscratch != noreg || always_reachable(src), "missing");
1788
1789 if (reachable(src)) {
1790 Assembler::ldmxcsr(as_Address(src));
1791 } else {
1792 lea(rscratch, src);
1793 Assembler::ldmxcsr(Address(rscratch, 0));
1794 }
1795 }
1796
1797 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1798 int off = offset();
1799 movsbl(dst, src); // movsxb
1800 return off;
1801 }
1802
1803 // Note: load_signed_short used to be called load_signed_word.
1804 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
1805 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
1806 // The term "word" in HotSpot means a 32- or 64-bit machine word.
1807 int MacroAssembler::load_signed_short(Register dst, Address src) {
1808 // This is dubious to me since it seems safe to do a signed 16 => 64 bit
1809 // version but this is what 64bit has always done. This seems to imply
1810 // that users are only using 32bits worth.
1811 int off = offset();
1812 movswl(dst, src); // movsxw
1813 return off;
1814 }
1815
1816 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1817 // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
1818 // and "3.9 Partial Register Penalties", p. 22).
1819 int off = offset();
1820 movzbl(dst, src); // movzxb
1821 return off;
1822 }
1823
1824 // Note: load_unsigned_short used to be called load_unsigned_word.
1825 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1826 // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
1827 // and "3.9 Partial Register Penalties", p. 22).
1828 int off = offset();
1829 movzwl(dst, src); // movzxw
1830 return off;
1831 }
1832
1833 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1834 switch (size_in_bytes) {
1835 case 8: movq(dst, src); break;
1836 case 4: movl(dst, src); break;
1837 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1838 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1839 default: ShouldNotReachHere();
1840 }
1841 }
1842
1843 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1844 switch (size_in_bytes) {
1845 case 8: movq(dst, src); break;
1846 case 4: movl(dst, src); break;
1847 case 2: movw(dst, src); break;
1848 case 1: movb(dst, src); break;
1849 default: ShouldNotReachHere();
1850 }
1851 }
1852
1853 void MacroAssembler::mov32(AddressLiteral dst, Register src, Register rscratch) {
1854 assert(rscratch != noreg || always_reachable(dst), "missing");
1855
1856 if (reachable(dst)) {
1857 movl(as_Address(dst), src);
1858 } else {
1859 lea(rscratch, dst);
1860 movl(Address(rscratch, 0), src);
1861 }
1862 }
1863
1864 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
1865 if (reachable(src)) {
1866 movl(dst, as_Address(src));
1867 } else {
1868 lea(dst, src);
1869 movl(dst, Address(dst, 0));
1870 }
1871 }
1872
1873 // C++ bool manipulation
1874
1875 void MacroAssembler::movbool(Register dst, Address src) {
1876 if(sizeof(bool) == 1)
1877 movb(dst, src);
1878 else if(sizeof(bool) == 2)
1879 movw(dst, src);
1880 else if(sizeof(bool) == 4)
1881 movl(dst, src);
1882 else
1883 // unsupported
1884 ShouldNotReachHere();
1885 }
1886
1887 void MacroAssembler::movbool(Address dst, bool boolconst) {
1888 if(sizeof(bool) == 1)
1889 movb(dst, (int) boolconst);
1890 else if(sizeof(bool) == 2)
1891 movw(dst, (int) boolconst);
1892 else if(sizeof(bool) == 4)
1893 movl(dst, (int) boolconst);
1894 else
1895 // unsupported
1896 ShouldNotReachHere();
1897 }
1898
1899 void MacroAssembler::movbool(Address dst, Register src) {
1900 if(sizeof(bool) == 1)
1901 movb(dst, src);
1902 else if(sizeof(bool) == 2)
1903 movw(dst, src);
1904 else if(sizeof(bool) == 4)
1905 movl(dst, src);
1906 else
1907 // unsupported
1908 ShouldNotReachHere();
1909 }
1910
1911 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src, Register rscratch) {
1912 assert(rscratch != noreg || always_reachable(src), "missing");
1913
1914 if (reachable(src)) {
1915 movdl(dst, as_Address(src));
1916 } else {
1917 lea(rscratch, src);
1918 movdl(dst, Address(rscratch, 0));
1919 }
1920 }
1921
1922 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src, Register rscratch) {
1923 assert(rscratch != noreg || always_reachable(src), "missing");
1924
1925 if (reachable(src)) {
1926 movq(dst, as_Address(src));
1927 } else {
1928 lea(rscratch, src);
1929 movq(dst, Address(rscratch, 0));
1930 }
1931 }
1932
1933 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src, Register rscratch) {
1934 assert(rscratch != noreg || always_reachable(src), "missing");
1935
1936 if (reachable(src)) {
1937 if (UseXmmLoadAndClearUpper) {
1938 movsd (dst, as_Address(src));
1939 } else {
1940 movlpd(dst, as_Address(src));
1941 }
1942 } else {
1943 lea(rscratch, src);
1944 if (UseXmmLoadAndClearUpper) {
1945 movsd (dst, Address(rscratch, 0));
1946 } else {
1947 movlpd(dst, Address(rscratch, 0));
1948 }
1949 }
1950 }
1951
1952 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src, Register rscratch) {
1953 assert(rscratch != noreg || always_reachable(src), "missing");
1954
1955 if (reachable(src)) {
1956 movss(dst, as_Address(src));
1957 } else {
1958 lea(rscratch, src);
1959 movss(dst, Address(rscratch, 0));
1960 }
1961 }
1962
1963 void MacroAssembler::movhlf(XMMRegister dst, XMMRegister src, Register rscratch) {
1964 if (VM_Version::supports_avx10_2()) {
1965 evmovw(dst, src);
1966 } else {
1967 assert(rscratch != noreg, "missing");
1968 evmovw(rscratch, src);
1969 evmovw(dst, rscratch);
1970 }
1971 }
1972
1973 void MacroAssembler::mov64(Register dst, int64_t imm64) {
1974 if (is_uimm32(imm64)) {
1975 movl(dst, checked_cast<uint32_t>(imm64));
1976 } else if (is_simm32(imm64)) {
1977 movq(dst, checked_cast<int32_t>(imm64));
1978 } else {
1979 Assembler::mov64(dst, imm64);
1980 }
1981 }
1982
// Relocatable variant: always forwards to Assembler::mov64 with the given
// relocation type and format, without the short-immediate selection done by
// the two-argument overload.
void MacroAssembler::mov64(Register dst, int64_t imm64, relocInfo::relocType rtype, int format) {
  Assembler::mov64(dst, imm64, rtype, format);
}
1986
// Pointer-sized register-to-register move; emitted as movq.
void MacroAssembler::movptr(Register dst, Register src) {
  movq(dst, src);
}
1990
// Pointer-sized load from memory into dst; emitted as movq.
void MacroAssembler::movptr(Register dst, Address src) {
  movq(dst, src);
}
1994
// Loads the pointer-sized immediate src into dst by delegating to mov64.
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Register dst, intptr_t src) {
  mov64(dst, src);
}
1999
// Pointer-sized store of src to memory; emitted as movq.
void MacroAssembler::movptr(Address dst, Register src) {
  movq(dst, src);
}
2003
// Stores the 32-bit immediate src to the pointer-sized memory slot dst;
// emitted as movslq.
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}
2007
// movdqu store. Asserts that src is one of XMM0-15 unless AVX512VL is
// supported, then forwards to Assembler::movdqu.
void MacroAssembler::movdqu(Address dst, XMMRegister src) {
  assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}
2012
// movdqu load. Asserts that dst is one of XMM0-15 unless AVX512VL is
// supported, then forwards to Assembler::movdqu.
void MacroAssembler::movdqu(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}
2017
// Register-to-register movdqu. Asserts that both registers are within XMM0-15
// unless AVX512VL is supported, then forwards to Assembler::movdqu.
void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}
2022
2023 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2024 assert(rscratch != noreg || always_reachable(src), "missing");
2025
2026 if (reachable(src)) {
2027 movdqu(dst, as_Address(src));
2028 } else {
2029 lea(rscratch, src);
2030 movdqu(dst, Address(rscratch, 0));
2031 }
2032 }
2033
// vmovdqu store. Asserts that src is one of XMM0-15 unless AVX512VL is
// supported, then forwards to Assembler::vmovdqu.
void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
  assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}
2038
// vmovdqu load. Asserts that dst is one of XMM0-15 unless AVX512VL is
// supported, then forwards to Assembler::vmovdqu.
void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}
2043
// Register-to-register vmovdqu. Asserts that both registers are within
// XMM0-15 unless AVX512VL is supported, then forwards to Assembler::vmovdqu.
void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}
2048
2049 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2050 assert(rscratch != noreg || always_reachable(src), "missing");
2051
2052 if (reachable(src)) {
2053 vmovdqu(dst, as_Address(src));
2054 }
2055 else {
2056 lea(rscratch, src);
2057 vmovdqu(dst, Address(rscratch, 0));
2058 }
2059 }
2060
2061 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2062 assert(rscratch != noreg || always_reachable(src), "missing");
2063
2064 if (vector_len == AVX_512bit) {
2065 evmovdquq(dst, src, AVX_512bit, rscratch);
2066 } else if (vector_len == AVX_256bit) {
2067 vmovdqu(dst, src, rscratch);
2068 } else {
2069 movdqu(dst, src, rscratch);
2070 }
2071 }
2072
2073 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src, int vector_len) {
2074 if (vector_len == AVX_512bit) {
2075 evmovdquq(dst, src, AVX_512bit);
2076 } else if (vector_len == AVX_256bit) {
2077 vmovdqu(dst, src);
2078 } else {
2079 movdqu(dst, src);
2080 }
2081 }
2082
2083 void MacroAssembler::vmovdqu(Address dst, XMMRegister src, int vector_len) {
2084 if (vector_len == AVX_512bit) {
2085 evmovdquq(dst, src, AVX_512bit);
2086 } else if (vector_len == AVX_256bit) {
2087 vmovdqu(dst, src);
2088 } else {
2089 movdqu(dst, src);
2090 }
2091 }
2092
2093 void MacroAssembler::vmovdqu(XMMRegister dst, Address src, int vector_len) {
2094 if (vector_len == AVX_512bit) {
2095 evmovdquq(dst, src, AVX_512bit);
2096 } else if (vector_len == AVX_256bit) {
2097 vmovdqu(dst, src);
2098 } else {
2099 movdqu(dst, src);
2100 }
2101 }
2102
2103 void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
2104 assert(rscratch != noreg || always_reachable(src), "missing");
2105
2106 if (reachable(src)) {
2107 vmovdqa(dst, as_Address(src));
2108 }
2109 else {
2110 lea(rscratch, src);
2111 vmovdqa(dst, Address(rscratch, 0));
2112 }
2113 }
2114
2115 void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2116 assert(rscratch != noreg || always_reachable(src), "missing");
2117
2118 if (vector_len == AVX_512bit) {
2119 evmovdqaq(dst, src, AVX_512bit, rscratch);
2120 } else if (vector_len == AVX_256bit) {
2121 vmovdqa(dst, src, rscratch);
2122 } else {
2123 movdqa(dst, src, rscratch);
2124 }
2125 }
2126
2127 void MacroAssembler::vmovdqa(XMMRegister dst, Address src, int vector_len) {
2128 if (vector_len == AVX_512bit) {
2129 Assembler::evmovdqaq(dst, src, AVX_512bit);
2130 } else if (vector_len == AVX_256bit) {
2131 Assembler::vmovdqa(dst, src);
2132 } else {
2133 Assembler::movdqa(dst, src);
2134 }
2135 }
2136
2137 void MacroAssembler::vmovdqa(Address dst, XMMRegister src, int vector_len) {
2138 if (vector_len == AVX_512bit) {
2139 Assembler::evmovdqaq(dst, src, AVX_512bit);
2140 } else if (vector_len == AVX_256bit) {
2141 Assembler::vmovdqa(dst, src);
2142 } else {
2143 Assembler::movdqa(dst, src);
2144 }
2145 }
2146
2147 void MacroAssembler::kmov(KRegister dst, Address src) {
2148 if (VM_Version::supports_avx512bw()) {
2149 kmovql(dst, src);
2150 } else {
2151 assert(VM_Version::supports_evex(), "");
2152 kmovwl(dst, src);
2153 }
2154 }
2155
2156 void MacroAssembler::kmov(Address dst, KRegister src) {
2157 if (VM_Version::supports_avx512bw()) {
2158 kmovql(dst, src);
2159 } else {
2160 assert(VM_Version::supports_evex(), "");
2161 kmovwl(dst, src);
2162 }
2163 }
2164
2165 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2166 if (VM_Version::supports_avx512bw()) {
2167 kmovql(dst, src);
2168 } else {
2169 assert(VM_Version::supports_evex(), "");
2170 kmovwl(dst, src);
2171 }
2172 }
2173
2174 void MacroAssembler::kmov(Register dst, KRegister src) {
2175 if (VM_Version::supports_avx512bw()) {
2176 kmovql(dst, src);
2177 } else {
2178 assert(VM_Version::supports_evex(), "");
2179 kmovwl(dst, src);
2180 }
2181 }
2182
2183 void MacroAssembler::kmov(KRegister dst, Register src) {
2184 if (VM_Version::supports_avx512bw()) {
2185 kmovql(dst, src);
2186 } else {
2187 assert(VM_Version::supports_evex(), "");
2188 kmovwl(dst, src);
2189 }
2190 }
2191
2192 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register rscratch) {
2193 assert(rscratch != noreg || always_reachable(src), "missing");
2194
2195 if (reachable(src)) {
2196 kmovql(dst, as_Address(src));
2197 } else {
2198 lea(rscratch, src);
2199 kmovql(dst, Address(rscratch, 0));
2200 }
2201 }
2202
2203 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register rscratch) {
2204 assert(rscratch != noreg || always_reachable(src), "missing");
2205
2206 if (reachable(src)) {
2207 kmovwl(dst, as_Address(src));
2208 } else {
2209 lea(rscratch, src);
2210 kmovwl(dst, Address(rscratch, 0));
2211 }
2212 }
2213
2214 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2215 int vector_len, Register rscratch) {
2216 assert(rscratch != noreg || always_reachable(src), "missing");
2217
2218 if (reachable(src)) {
2219 Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2220 } else {
2221 lea(rscratch, src);
2222 Assembler::evmovdqub(dst, mask, Address(rscratch, 0), merge, vector_len);
2223 }
2224 }
2225
2226 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2227 int vector_len, Register rscratch) {
2228 assert(rscratch != noreg || always_reachable(src), "missing");
2229
2230 if (reachable(src)) {
2231 Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2232 } else {
2233 lea(rscratch, src);
2234 Assembler::evmovdquw(dst, mask, Address(rscratch, 0), merge, vector_len);
2235 }
2236 }
2237
2238 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2239 assert(rscratch != noreg || always_reachable(src), "missing");
2240
2241 if (reachable(src)) {
2242 Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2243 } else {
2244 lea(rscratch, src);
2245 Assembler::evmovdqul(dst, mask, Address(rscratch, 0), merge, vector_len);
2246 }
2247 }
2248
2249 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2250 assert(rscratch != noreg || always_reachable(src), "missing");
2251
2252 if (reachable(src)) {
2253 Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2254 } else {
2255 lea(rscratch, src);
2256 Assembler::evmovdquq(dst, mask, Address(rscratch, 0), merge, vector_len);
2257 }
2258 }
2259
2260 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2261 assert(rscratch != noreg || always_reachable(src), "missing");
2262
2263 if (reachable(src)) {
2264 Assembler::evmovdquq(dst, as_Address(src), vector_len);
2265 } else {
2266 lea(rscratch, src);
2267 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2268 }
2269 }
2270
2271 void MacroAssembler::evmovdqaq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2272 assert(rscratch != noreg || always_reachable(src), "missing");
2273
2274 if (reachable(src)) {
2275 Assembler::evmovdqaq(dst, mask, as_Address(src), merge, vector_len);
2276 } else {
2277 lea(rscratch, src);
2278 Assembler::evmovdqaq(dst, mask, Address(rscratch, 0), merge, vector_len);
2279 }
2280 }
2281
2282 void MacroAssembler::evmovdqaq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2283 assert(rscratch != noreg || always_reachable(src), "missing");
2284
2285 if (reachable(src)) {
2286 Assembler::evmovdqaq(dst, as_Address(src), vector_len);
2287 } else {
2288 lea(rscratch, src);
2289 Assembler::evmovdqaq(dst, Address(rscratch, 0), vector_len);
2290 }
2291 }
2292
2293 void MacroAssembler::movapd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2294 assert(rscratch != noreg || always_reachable(src), "missing");
2295
2296 if (reachable(src)) {
2297 Assembler::movapd(dst, as_Address(src));
2298 } else {
2299 lea(rscratch, src);
2300 Assembler::movapd(dst, Address(rscratch, 0));
2301 }
2302 }
2303
2304 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
2305 assert(rscratch != noreg || always_reachable(src), "missing");
2306
2307 if (reachable(src)) {
2308 Assembler::movdqa(dst, as_Address(src));
2309 } else {
2310 lea(rscratch, src);
2311 Assembler::movdqa(dst, Address(rscratch, 0));
2312 }
2313 }
2314
2315 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2316 assert(rscratch != noreg || always_reachable(src), "missing");
2317
2318 if (reachable(src)) {
2319 Assembler::movsd(dst, as_Address(src));
2320 } else {
2321 lea(rscratch, src);
2322 Assembler::movsd(dst, Address(rscratch, 0));
2323 }
2324 }
2325
2326 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2327 assert(rscratch != noreg || always_reachable(src), "missing");
2328
2329 if (reachable(src)) {
2330 Assembler::movss(dst, as_Address(src));
2331 } else {
2332 lea(rscratch, src);
2333 Assembler::movss(dst, Address(rscratch, 0));
2334 }
2335 }
2336
2337 void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) {
2338 assert(rscratch != noreg || always_reachable(src), "missing");
2339
2340 if (reachable(src)) {
2341 Assembler::movddup(dst, as_Address(src));
2342 } else {
2343 lea(rscratch, src);
2344 Assembler::movddup(dst, Address(rscratch, 0));
2345 }
2346 }
2347
2348 void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2349 assert(rscratch != noreg || always_reachable(src), "missing");
2350
2351 if (reachable(src)) {
2352 Assembler::vmovddup(dst, as_Address(src), vector_len);
2353 } else {
2354 lea(rscratch, src);
2355 Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
2356 }
2357 }
2358
2359 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2360 assert(rscratch != noreg || always_reachable(src), "missing");
2361
2362 if (reachable(src)) {
2363 Assembler::mulsd(dst, as_Address(src));
2364 } else {
2365 lea(rscratch, src);
2366 Assembler::mulsd(dst, Address(rscratch, 0));
2367 }
2368 }
2369
2370 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2371 assert(rscratch != noreg || always_reachable(src), "missing");
2372
2373 if (reachable(src)) {
2374 Assembler::mulss(dst, as_Address(src));
2375 } else {
2376 lea(rscratch, src);
2377 Assembler::mulss(dst, Address(rscratch, 0));
2378 }
2379 }
2380
2381 void MacroAssembler::null_check(Register reg, int offset) {
2382 if (needs_explicit_null_check(offset)) {
2383 // provoke OS null exception if reg is null by
2384 // accessing M[reg] w/o changing any (non-CC) registers
2385 // NOTE: cmpl is plenty here to provoke a segv
2386 cmpptr(rax, Address(reg, 0));
2387 // Note: should probably use testl(rax, Address(reg, 0));
2388 // may be shorter code (however, this version of
2389 // testl needs to be implemented first)
2390 } else {
2391 // nothing to do, (later) access of M[reg + offset]
2392 // will provoke OS null exception if reg is null
2393 }
2394 }
2395
// Emits a call to os::breakpoint through a RuntimeAddress.
void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
  // (e.g., MSVC can't call ps() otherwise)
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}
2401
2402 void MacroAssembler::unimplemented(const char* what) {
2403 const char* buf = nullptr;
2404 {
2405 ResourceMark rm;
2406 stringStream ss;
2407 ss.print("unimplemented: %s", what);
2408 buf = code_string(ss.as_string());
2409 }
2410 stop(buf);
2411 }
2412
// NOTE(review): presumably the byte offset (0x200 = 512) of the XSTATE_BV
// field inside an XSAVE area; not referenced in this part of the file, so
// confirm against its users.
#define XSTATE_BV 0x200
2414
// Restores the state saved by push_CPU_state(), in reverse order: FPU state
// first, then the integer state.
void MacroAssembler::pop_CPU_state() {
  pop_FPU_state();
  pop_IU_state();
}
2419
// Restores the FPU state with fxrstor from the top of the stack and then
// releases the FPUStateSizeInWords * wordSize bytes it occupied.
void MacroAssembler::pop_FPU_state() {
  fxrstor(Address(rsp, 0));
  addptr(rsp, FPUStateSizeInWords * wordSize);
}
2424
// Undoes push_IU_state() in reverse order: pops the registers, skips the
// 8-byte slot that push_IU_state() reserved for alignment, then pops the
// flags.
void MacroAssembler::pop_IU_state() {
  popa();
  addq(rsp, 8);
  popf();
}
2430
// Save Integer and Float state
// Warning: Stack must be 16 byte aligned (64bit)
// The integer state is pushed first, then the FPU state; pop_CPU_state()
// restores them in the opposite order.
void MacroAssembler::push_CPU_state() {
  push_IU_state();
  push_FPU_state();
}
2437
// Reserves FPUStateSizeInWords * wordSize bytes on the stack and saves the
// FPU state into them with fxsave.
void MacroAssembler::push_FPU_state() {
  subptr(rsp, FPUStateSizeInWords * wordSize);
  fxsave(Address(rsp, 0));
}
2442
// Saves the flags and the integer registers on the stack, with an 8-byte
// padding slot between them. pop_IU_state() is the inverse.
void MacroAssembler::push_IU_state() {
  // Push flags first because pusha kills them
  pushf();
  // Make sure rsp stays 16-byte aligned
  subq(rsp, 8);
  pusha();
}
2450
2451 void MacroAssembler::push_cont_fastpath() {
2452 if (!Continuations::enabled()) return;
2453
2454 Label L_done;
2455 cmpptr(rsp, Address(r15_thread, JavaThread::cont_fastpath_offset()));
2456 jccb(Assembler::belowEqual, L_done);
2457 movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rsp);
2458 bind(L_done);
2459 }
2460
2461 void MacroAssembler::pop_cont_fastpath() {
2462 if (!Continuations::enabled()) return;
2463
2464 Label L_done;
2465 cmpptr(rsp, Address(r15_thread, JavaThread::cont_fastpath_offset()));
2466 jccb(Assembler::below, L_done);
2467 movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
2468 bind(L_done);
2469 }
2470
2471 #ifdef ASSERT
2472 void MacroAssembler::stop_if_in_cont(Register cont, const char* name) {
2473 Label no_cont;
2474 movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset()));
2475 testl(cont, cont);
2476 jcc(Assembler::zero, no_cont);
2477 stop(name);
2478 bind(no_cont);
2479 }
2480 #endif
2481
2482 void MacroAssembler::reset_last_Java_frame(bool clear_fp) { // determine java_thread register
2483 // we must set sp to zero to clear frame
2484 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
2485 // must clear fp, so that compiled frames are not confused; it is
2486 // possible that we need it only for debugging
2487 if (clear_fp) {
2488 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2489 }
2490 // Always clear the pc because it could have been set by make_walkable()
2491 movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
2492 vzeroupper();
2493 }
2494
// Rounds reg up in place by adding modulus - 1 and then masking with
// -modulus.
// NOTE(review): the mask only yields a multiple of modulus when modulus is a
// power of two; this is not checked here.
void MacroAssembler::round_to(Register reg, int modulus) {
  addptr(reg, modulus - 1);
  andptr(reg, -modulus);
}
2499
2500 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod) {
2501 if (at_return) {
2502 // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
2503 // we may safely use rsp instead to perform the stack watermark check.
2504 cmpptr(in_nmethod ? rsp : rbp, Address(r15_thread, JavaThread::polling_word_offset()));
2505 jcc(Assembler::above, slow_path);
2506 return;
2507 }
2508 testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2509 jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
2510 }
2511
2512 // Calls to C land
2513 //
2514 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded
2515 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
2516 // has to be reset to 0. This is required to allow proper stack traversal.
2517 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
2518 Register last_java_fp,
2519 address last_java_pc,
2520 Register rscratch) {
2521 vzeroupper();
2522 // determine last_java_sp register
2523 if (!last_java_sp->is_valid()) {
2524 last_java_sp = rsp;
2525 }
2526 // last_java_fp is optional
2527 if (last_java_fp->is_valid()) {
2528 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
2529 }
2530 // last_java_pc is optional
2531 if (last_java_pc != nullptr) {
2532 Address java_pc(r15_thread,
2533 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
2534 lea(java_pc, InternalAddress(last_java_pc), rscratch);
2535 }
2536 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
2537 }
2538
// Variant that takes the last Java pc as a Label. The label's address is
// loaded into scratch with lea and stored to the thread's last_Java_pc slot;
// the remaining fields are then set by the address-based overload, which is
// given a null pc so it does not store the pc again.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  lea(scratch, L);
  movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), scratch);
  set_last_Java_frame(last_java_sp, last_java_fp, nullptr, scratch);
}
2547
// Pointer-sized left shift of dst by imm8; emitted as shlq.
void MacroAssembler::shlptr(Register dst, int imm8) {
  shlq(dst, imm8);
}
2551
// Pointer-sized right shift of dst by imm8; emitted as shrq.
void MacroAssembler::shrptr(Register dst, int imm8) {
  shrq(dst, imm8);
}
2555
// Sign-extends the low byte of reg in place; emitted as movsbl reg, reg.
void MacroAssembler::sign_extend_byte(Register reg) {
  movsbl(reg, reg); // movsxb
}
2559
// Sign-extends the low 16 bits of reg in place; emitted as movswl reg, reg.
void MacroAssembler::sign_extend_short(Register reg) {
  movswl(reg, reg); // movsxw
}
2563
2564 void MacroAssembler::narrow_subword_type(Register reg, BasicType bt) {
2565 assert(is_subword_type(bt), "required");
2566 switch (bt) {
2567 case T_BOOLEAN: andl(reg, 1); break;
2568 case T_BYTE: movsbl(reg, reg); break;
2569 case T_CHAR: movzwl(reg, reg); break;
2570 case T_SHORT: movswl(reg, reg); break;
2571 default: ShouldNotReachHere();
2572 }
2573 }
2574
2575 void MacroAssembler::testl(Address dst, int32_t imm32) {
2576 if (imm32 >= 0 && is8bit(imm32)) {
2577 testb(dst, imm32);
2578 } else {
2579 Assembler::testl(dst, imm32);
2580 }
2581 }
2582
2583 void MacroAssembler::testl(Register dst, int32_t imm32) {
2584 if (imm32 >= 0 && is8bit(imm32) && dst->has_byte_register()) {
2585 testb(dst, imm32);
2586 } else {
2587 Assembler::testl(dst, imm32);
2588 }
2589 }
2590
// testl of dst against a literal address. There is no scratch-register
// fallback here: src must be always_reachable (asserted) and is used directly
// through as_Address().
void MacroAssembler::testl(Register dst, AddressLiteral src) {
  assert(always_reachable(src), "Address should be reachable");
  testl(dst, as_Address(src));
}
2595
2596 void MacroAssembler::testq(Address dst, int32_t imm32) {
2597 if (imm32 >= 0) {
2598 testl(dst, imm32);
2599 } else {
2600 Assembler::testq(dst, imm32);
2601 }
2602 }
2603
2604 void MacroAssembler::testq(Register dst, int32_t imm32) {
2605 if (imm32 >= 0) {
2606 testl(dst, imm32);
2607 } else {
2608 Assembler::testq(dst, imm32);
2609 }
2610 }
2611
// pcmpeqb on two XMM registers. Asserts that both are within XMM0-15 unless
// supports_avx512vlbw() holds, then forwards to Assembler::pcmpeqb.
void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pcmpeqb(dst, src);
}
2616
// pcmpeqw on two XMM registers. Asserts that both are within XMM0-15 unless
// supports_avx512vlbw() holds, then forwards to Assembler::pcmpeqw.
void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pcmpeqw(dst, src);
}
2621
// pcmpestri with a memory operand. Asserts unconditionally that dst is within
// XMM0-15, then forwards to Assembler::pcmpestri.
void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
  assert((dst->encoding() < 16),"XMM register should be 0-15");
  Assembler::pcmpestri(dst, src, imm8);
}
2626
// pcmpestri on two XMM registers. Asserts unconditionally that both are
// within XMM0-15, then forwards to Assembler::pcmpestri.
void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
  Assembler::pcmpestri(dst, src, imm8);
}
2631
// pmovzxbw on two XMM registers. Asserts that both are within XMM0-15 unless
// supports_avx512vlbw() holds, then forwards to Assembler::pmovzxbw.
void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pmovzxbw(dst, src);
}
2636
// pmovzxbw with a memory source. Asserts that dst is within XMM0-15 unless
// supports_avx512vlbw() holds, then forwards to Assembler::pmovzxbw.
void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pmovzxbw(dst, src);
}
2641
// pmovmskb from an XMM register into a general purpose register. Asserts
// unconditionally that src is within XMM0-15, then forwards to
// Assembler::pmovmskb.
void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
  assert((src->encoding() < 16),"XMM register should be 0-15");
  Assembler::pmovmskb(dst, src);
}
2646
// ptest on two XMM registers. Asserts unconditionally that both are within
// XMM0-15, then forwards to Assembler::ptest.
void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
  Assembler::ptest(dst, src);
}
2651
2652 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2653 assert(rscratch != noreg || always_reachable(src), "missing");
2654
2655 if (reachable(src)) {
2656 Assembler::sqrtss(dst, as_Address(src));
2657 } else {
2658 lea(rscratch, src);
2659 Assembler::sqrtss(dst, Address(rscratch, 0));
2660 }
2661 }
2662
2663 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2664 assert(rscratch != noreg || always_reachable(src), "missing");
2665
2666 if (reachable(src)) {
2667 Assembler::subsd(dst, as_Address(src));
2668 } else {
2669 lea(rscratch, src);
2670 Assembler::subsd(dst, Address(rscratch, 0));
2671 }
2672 }
2673
2674 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch) {
2675 assert(rscratch != noreg || always_reachable(src), "missing");
2676
2677 if (reachable(src)) {
2678 Assembler::roundsd(dst, as_Address(src), rmode);
2679 } else {
2680 lea(rscratch, src);
2681 Assembler::roundsd(dst, Address(rscratch, 0), rmode);
2682 }
2683 }
2684
2685 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2686 assert(rscratch != noreg || always_reachable(src), "missing");
2687
2688 if (reachable(src)) {
2689 Assembler::subss(dst, as_Address(src));
2690 } else {
2691 lea(rscratch, src);
2692 Assembler::subss(dst, Address(rscratch, 0));
2693 }
2694 }
2695
2696 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2697 assert(rscratch != noreg || always_reachable(src), "missing");
2698
2699 if (reachable(src)) {
2700 Assembler::ucomisd(dst, as_Address(src));
2701 } else {
2702 lea(rscratch, src);
2703 Assembler::ucomisd(dst, Address(rscratch, 0));
2704 }
2705 }
2706
2707 void MacroAssembler::evucomxsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2708 assert(rscratch != noreg || always_reachable(src), "missing");
2709
2710 if (reachable(src)) {
2711 Assembler::evucomxsd(dst, as_Address(src));
2712 } else {
2713 lea(rscratch, src);
2714 Assembler::evucomxsd(dst, Address(rscratch, 0));
2715 }
2716 }
2717
2718 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2719 assert(rscratch != noreg || always_reachable(src), "missing");
2720
2721 if (reachable(src)) {
2722 Assembler::ucomiss(dst, as_Address(src));
2723 } else {
2724 lea(rscratch, src);
2725 Assembler::ucomiss(dst, Address(rscratch, 0));
2726 }
2727 }
2728
2729 void MacroAssembler::evucomxss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2730 assert(rscratch != noreg || always_reachable(src), "missing");
2731
2732 if (reachable(src)) {
2733 Assembler::evucomxss(dst, as_Address(src));
2734 } else {
2735 lea(rscratch, src);
2736 Assembler::evucomxss(dst, Address(rscratch, 0));
2737 }
2738 }
2739
2740 void MacroAssembler::evucomish(XMMRegister dst, AddressLiteral src, Register rscratch) {
2741 assert(rscratch != noreg || always_reachable(src), "missing");
2742
2743 if (reachable(src)) {
2744 Assembler::evucomish(dst, as_Address(src));
2745 } else {
2746 lea(rscratch, src);
2747 Assembler::evucomish(dst, Address(rscratch, 0));
2748 }
2749 }
2750
2751 void MacroAssembler::evucomxsh(XMMRegister dst, AddressLiteral src, Register rscratch) {
2752 assert(rscratch != noreg || always_reachable(src), "missing");
2753
2754 if (reachable(src)) {
2755 Assembler::evucomxsh(dst, as_Address(src));
2756 } else {
2757 lea(rscratch, src);
2758 Assembler::evucomxsh(dst, Address(rscratch, 0));
2759 }
2760 }
2761
2762 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2763 assert(rscratch != noreg || always_reachable(src), "missing");
2764
2765 // Used in sign-bit flipping with aligned address.
2766 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
2767
2768 if (UseAVX > 2 &&
2769 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2770 (dst->encoding() >= 16)) {
2771 vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch);
2772 } else if (reachable(src)) {
2773 Assembler::xorpd(dst, as_Address(src));
2774 } else {
2775 lea(rscratch, src);
2776 Assembler::xorpd(dst, Address(rscratch, 0));
2777 }
2778 }
2779
2780 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
2781 if (UseAVX > 2 &&
2782 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2783 ((dst->encoding() >= 16) || (src->encoding() >= 16))) {
2784 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
2785 } else {
2786 Assembler::xorpd(dst, src);
2787 }
2788 }
2789
2790 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
2791 if (UseAVX > 2 &&
2792 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2793 ((dst->encoding() >= 16) || (src->encoding() >= 16))) {
2794 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
2795 } else {
2796 Assembler::xorps(dst, src);
2797 }
2798 }
2799
2800 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register rscratch) {
2801 assert(rscratch != noreg || always_reachable(src), "missing");
2802
2803 // Used in sign-bit flipping with aligned address.
2804 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
2805
2806 if (UseAVX > 2 &&
2807 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2808 (dst->encoding() >= 16)) {
2809 vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch);
2810 } else if (reachable(src)) {
2811 Assembler::xorps(dst, as_Address(src));
2812 } else {
2813 lea(rscratch, src);
2814 Assembler::xorps(dst, Address(rscratch, 0));
2815 }
2816 }
2817
2818 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src, Register rscratch) {
2819 assert(rscratch != noreg || always_reachable(src), "missing");
2820
2821 // Used in sign-bit flipping with aligned address.
2822 bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
2823 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
2824 if (reachable(src)) {
2825 Assembler::pshufb(dst, as_Address(src));
2826 } else {
2827 lea(rscratch, src);
2828 Assembler::pshufb(dst, Address(rscratch, 0));
2829 }
2830 }
2831
2832 // AVX 3-operands instructions
2833
2834 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
2835 assert(rscratch != noreg || always_reachable(src), "missing");
2836
2837 if (reachable(src)) {
2838 vaddsd(dst, nds, as_Address(src));
2839 } else {
2840 lea(rscratch, src);
2841 vaddsd(dst, nds, Address(rscratch, 0));
2842 }
2843 }
2844
2845 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
2846 assert(rscratch != noreg || always_reachable(src), "missing");
2847
2848 if (reachable(src)) {
2849 vaddss(dst, nds, as_Address(src));
2850 } else {
2851 lea(rscratch, src);
2852 vaddss(dst, nds, Address(rscratch, 0));
2853 }
2854 }
2855
2856 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2857 assert(UseAVX > 0, "requires some form of AVX");
2858 assert(rscratch != noreg || always_reachable(src), "missing");
2859
2860 if (reachable(src)) {
2861 Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
2862 } else {
2863 lea(rscratch, src);
2864 Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
2865 }
2866 }
2867
2868 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2869 assert(UseAVX > 0, "requires some form of AVX");
2870 assert(rscratch != noreg || always_reachable(src), "missing");
2871
2872 if (reachable(src)) {
2873 Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
2874 } else {
2875 lea(rscratch, src);
2876 Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
2877 }
2878 }
2879
2880 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
2881 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
2882 assert(rscratch != noreg || always_reachable(negate_field), "missing");
2883
2884 vandps(dst, nds, negate_field, vector_len, rscratch);
2885 }
2886
2887 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
2888 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
2889 assert(rscratch != noreg || always_reachable(negate_field), "missing");
2890
2891 vandpd(dst, nds, negate_field, vector_len, rscratch);
2892 }
2893
2894 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
2895 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2896 Assembler::vpaddb(dst, nds, src, vector_len);
2897 }
2898
2899 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
2900 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2901 Assembler::vpaddb(dst, nds, src, vector_len);
2902 }
2903
2904 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
2905 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2906 Assembler::vpaddw(dst, nds, src, vector_len);
2907 }
2908
2909 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
2910 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2911 Assembler::vpaddw(dst, nds, src, vector_len);
2912 }
2913
2914 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2915 assert(rscratch != noreg || always_reachable(src), "missing");
2916
2917 if (reachable(src)) {
2918 Assembler::vpand(dst, nds, as_Address(src), vector_len);
2919 } else {
2920 lea(rscratch, src);
2921 Assembler::vpand(dst, nds, Address(rscratch, 0), vector_len);
2922 }
2923 }
2924
2925 void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2926 assert(rscratch != noreg || always_reachable(src), "missing");
2927
2928 if (reachable(src)) {
2929 Assembler::vpbroadcastd(dst, as_Address(src), vector_len);
2930 } else {
2931 lea(rscratch, src);
2932 Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len);
2933 }
2934 }
2935
2936 void MacroAssembler::vbroadcasti128(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2937 assert(rscratch != noreg || always_reachable(src), "missing");
2938
2939 if (reachable(src)) {
2940 Assembler::vbroadcasti128(dst, as_Address(src), vector_len);
2941 } else {
2942 lea(rscratch, src);
2943 Assembler::vbroadcasti128(dst, Address(rscratch, 0), vector_len);
2944 }
2945 }
2946
2947 void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2948 assert(rscratch != noreg || always_reachable(src), "missing");
2949
2950 if (reachable(src)) {
2951 Assembler::vpbroadcastq(dst, as_Address(src), vector_len);
2952 } else {
2953 lea(rscratch, src);
2954 Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len);
2955 }
2956 }
2957
2958 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2959 assert(rscratch != noreg || always_reachable(src), "missing");
2960
2961 if (reachable(src)) {
2962 Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
2963 } else {
2964 lea(rscratch, src);
2965 Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
2966 }
2967 }
2968
2969 void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2970 assert(rscratch != noreg || always_reachable(src), "missing");
2971
2972 if (reachable(src)) {
2973 Assembler::vbroadcastss(dst, as_Address(src), vector_len);
2974 } else {
2975 lea(rscratch, src);
2976 Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len);
2977 }
2978 }
2979
2980 // Vector float blend
2981 // vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
2982 void MacroAssembler::vblendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
2983 // WARN: Allow dst == (src1|src2), mask == scratch
2984 bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1 &&
2985 !(VM_Version::is_intel_darkmont() && (dst == src1)); // partially fixed on Darkmont
2986 bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst;
2987 bool dst_available = dst != mask && (dst != src1 || dst != src2);
2988 if (blend_emulation && scratch_available && dst_available) {
2989 if (compute_mask) {
2990 vpsrad(scratch, mask, 32, vector_len);
2991 mask = scratch;
2992 }
2993 if (dst == src1) {
2994 vpandn(dst, mask, src1, vector_len); // if mask == 0, src1
2995 vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
2996 } else {
2997 vpand (dst, mask, src2, vector_len); // if mask == 1, src2
2998 vpandn(scratch, mask, src1, vector_len); // if mask == 0, src1
2999 }
3000 vpor(dst, dst, scratch, vector_len);
3001 } else {
3002 Assembler::vblendvps(dst, src1, src2, mask, vector_len);
3003 }
3004 }
3005
3006 // vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
3007 void MacroAssembler::vblendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
3008 // WARN: Allow dst == (src1|src2), mask == scratch
3009 bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1 &&
3010 !(VM_Version::is_intel_darkmont() && (dst == src1)); // partially fixed on Darkmont
3011 bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst && (!compute_mask || scratch != mask);
3012 bool dst_available = dst != mask && (dst != src1 || dst != src2);
3013 if (blend_emulation && scratch_available && dst_available) {
3014 if (compute_mask) {
3015 vpxor(scratch, scratch, scratch, vector_len);
3016 vpcmpgtq(scratch, scratch, mask, vector_len);
3017 mask = scratch;
3018 }
3019 if (dst == src1) {
3020 vpandn(dst, mask, src1, vector_len); // if mask == 0, src
3021 vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
3022 } else {
3023 vpand (dst, mask, src2, vector_len); // if mask == 1, src2
3024 vpandn(scratch, mask, src1, vector_len); // if mask == 0, src
3025 }
3026 vpor(dst, dst, scratch, vector_len);
3027 } else {
3028 Assembler::vblendvpd(dst, src1, src2, mask, vector_len);
3029 }
3030 }
3031
3032 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3033 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3034 Assembler::vpcmpeqb(dst, nds, src, vector_len);
3035 }
3036
3037 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) {
3038 assert(((dst->encoding() < 16 && src1->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3039 Assembler::vpcmpeqb(dst, src1, src2, vector_len);
3040 }
3041
3042 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3043 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3044 Assembler::vpcmpeqw(dst, nds, src, vector_len);
3045 }
3046
3047 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3048 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3049 Assembler::vpcmpeqw(dst, nds, src, vector_len);
3050 }
3051
3052 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3053 assert(rscratch != noreg || always_reachable(src), "missing");
3054
3055 if (reachable(src)) {
3056 Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3057 } else {
3058 lea(rscratch, src);
3059 Assembler::evpcmpeqd(kdst, mask, nds, Address(rscratch, 0), vector_len);
3060 }
3061 }
3062
3063 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3064 int comparison, bool is_signed, int vector_len, Register rscratch) {
3065 assert(rscratch != noreg || always_reachable(src), "missing");
3066
3067 if (reachable(src)) {
3068 Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3069 } else {
3070 lea(rscratch, src);
3071 Assembler::evpcmpd(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3072 }
3073 }
3074
3075 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3076 int comparison, bool is_signed, int vector_len, Register rscratch) {
3077 assert(rscratch != noreg || always_reachable(src), "missing");
3078
3079 if (reachable(src)) {
3080 Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3081 } else {
3082 lea(rscratch, src);
3083 Assembler::evpcmpq(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3084 }
3085 }
3086
3087 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3088 int comparison, bool is_signed, int vector_len, Register rscratch) {
3089 assert(rscratch != noreg || always_reachable(src), "missing");
3090
3091 if (reachable(src)) {
3092 Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3093 } else {
3094 lea(rscratch, src);
3095 Assembler::evpcmpb(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3096 }
3097 }
3098
3099 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3100 int comparison, bool is_signed, int vector_len, Register rscratch) {
3101 assert(rscratch != noreg || always_reachable(src), "missing");
3102
3103 if (reachable(src)) {
3104 Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3105 } else {
3106 lea(rscratch, src);
3107 Assembler::evpcmpw(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3108 }
3109 }
3110
3111 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3112 if (width == Assembler::Q) {
3113 Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3114 } else {
3115 Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3116 }
3117 }
3118
3119 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
3120 int eq_cond_enc = 0x29;
3121 int gt_cond_enc = 0x37;
3122 if (width != Assembler::Q) {
3123 eq_cond_enc = 0x74 + width;
3124 gt_cond_enc = 0x64 + width;
3125 }
3126 switch (cond) {
3127 case eq:
3128 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3129 break;
3130 case neq:
3131 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3132 vallones(xtmp, vector_len);
3133 vpxor(dst, xtmp, dst, vector_len);
3134 break;
3135 case le:
3136 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3137 vallones(xtmp, vector_len);
3138 vpxor(dst, xtmp, dst, vector_len);
3139 break;
3140 case nlt:
3141 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3142 vallones(xtmp, vector_len);
3143 vpxor(dst, xtmp, dst, vector_len);
3144 break;
3145 case lt:
3146 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3147 break;
3148 case nle:
3149 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3150 break;
3151 default:
3152 assert(false, "Should not reach here");
3153 }
3154 }
3155
3156 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3157 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3158 Assembler::vpmovzxbw(dst, src, vector_len);
3159 }
3160
3161 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3162 assert((src->encoding() < 16),"XMM register should be 0-15");
3163 Assembler::vpmovmskb(dst, src, vector_len);
3164 }
3165
3166 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3167 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3168 Assembler::vpmullw(dst, nds, src, vector_len);
3169 }
3170
3171 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3172 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3173 Assembler::vpmullw(dst, nds, src, vector_len);
3174 }
3175
3176 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3177 assert((UseAVX > 0), "AVX support is needed");
3178 assert(rscratch != noreg || always_reachable(src), "missing");
3179
3180 if (reachable(src)) {
3181 Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3182 } else {
3183 lea(rscratch, src);
3184 Assembler::vpmulld(dst, nds, Address(rscratch, 0), vector_len);
3185 }
3186 }
3187
3188 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3189 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3190 Assembler::vpsubb(dst, nds, src, vector_len);
3191 }
3192
3193 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3194 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3195 Assembler::vpsubb(dst, nds, src, vector_len);
3196 }
3197
3198 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3199 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3200 Assembler::vpsubw(dst, nds, src, vector_len);
3201 }
3202
3203 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3204 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3205 Assembler::vpsubw(dst, nds, src, vector_len);
3206 }
3207
3208 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3209 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3210 Assembler::vpsraw(dst, nds, shift, vector_len);
3211 }
3212
3213 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3214 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3215 Assembler::vpsraw(dst, nds, shift, vector_len);
3216 }
3217
3218 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3219 assert(UseAVX > 2,"");
3220 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3221 vector_len = 2;
3222 }
3223 Assembler::evpsraq(dst, nds, shift, vector_len);
3224 }
3225
3226 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3227 assert(UseAVX > 2,"");
3228 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3229 vector_len = 2;
3230 }
3231 Assembler::evpsraq(dst, nds, shift, vector_len);
3232 }
3233
3234 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3235 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3236 Assembler::vpsrlw(dst, nds, shift, vector_len);
3237 }
3238
3239 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3240 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3241 Assembler::vpsrlw(dst, nds, shift, vector_len);
3242 }
3243
3244 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3245 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3246 Assembler::vpsllw(dst, nds, shift, vector_len);
3247 }
3248
3249 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3250 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3251 Assembler::vpsllw(dst, nds, shift, vector_len);
3252 }
3253
3254 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3255 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3256 Assembler::vptest(dst, src);
3257 }
3258
3259 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3260 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3261 Assembler::punpcklbw(dst, src);
3262 }
3263
3264 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3265 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3266 Assembler::pshufd(dst, src, mode);
3267 }
3268
3269 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3270 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3271 Assembler::pshuflw(dst, src, mode);
3272 }
3273
3274 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3275 assert(rscratch != noreg || always_reachable(src), "missing");
3276
3277 if (reachable(src)) {
3278 vandpd(dst, nds, as_Address(src), vector_len);
3279 } else {
3280 lea(rscratch, src);
3281 vandpd(dst, nds, Address(rscratch, 0), vector_len);
3282 }
3283 }
3284
3285 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3286 assert(rscratch != noreg || always_reachable(src), "missing");
3287
3288 if (reachable(src)) {
3289 vandps(dst, nds, as_Address(src), vector_len);
3290 } else {
3291 lea(rscratch, src);
3292 vandps(dst, nds, Address(rscratch, 0), vector_len);
3293 }
3294 }
3295
3296 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3297 bool merge, int vector_len, Register rscratch) {
3298 assert(rscratch != noreg || always_reachable(src), "missing");
3299
3300 if (reachable(src)) {
3301 Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3302 } else {
3303 lea(rscratch, src);
3304 Assembler::evpord(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
3305 }
3306 }
3307
3308 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3309 assert(rscratch != noreg || always_reachable(src), "missing");
3310
3311 if (reachable(src)) {
3312 vdivsd(dst, nds, as_Address(src));
3313 } else {
3314 lea(rscratch, src);
3315 vdivsd(dst, nds, Address(rscratch, 0));
3316 }
3317 }
3318
3319 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3320 assert(rscratch != noreg || always_reachable(src), "missing");
3321
3322 if (reachable(src)) {
3323 vdivss(dst, nds, as_Address(src));
3324 } else {
3325 lea(rscratch, src);
3326 vdivss(dst, nds, Address(rscratch, 0));
3327 }
3328 }
3329
3330 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3331 assert(rscratch != noreg || always_reachable(src), "missing");
3332
3333 if (reachable(src)) {
3334 vmulsd(dst, nds, as_Address(src));
3335 } else {
3336 lea(rscratch, src);
3337 vmulsd(dst, nds, Address(rscratch, 0));
3338 }
3339 }
3340
3341 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3342 assert(rscratch != noreg || always_reachable(src), "missing");
3343
3344 if (reachable(src)) {
3345 vmulss(dst, nds, as_Address(src));
3346 } else {
3347 lea(rscratch, src);
3348 vmulss(dst, nds, Address(rscratch, 0));
3349 }
3350 }
3351
3352 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3353 assert(rscratch != noreg || always_reachable(src), "missing");
3354
3355 if (reachable(src)) {
3356 vsubsd(dst, nds, as_Address(src));
3357 } else {
3358 lea(rscratch, src);
3359 vsubsd(dst, nds, Address(rscratch, 0));
3360 }
3361 }
3362
3363 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3364 assert(rscratch != noreg || always_reachable(src), "missing");
3365
3366 if (reachable(src)) {
3367 vsubss(dst, nds, as_Address(src));
3368 } else {
3369 lea(rscratch, src);
3370 vsubss(dst, nds, Address(rscratch, 0));
3371 }
3372 }
3373
3374 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3375 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3376 assert(rscratch != noreg || always_reachable(src), "missing");
3377
3378 vxorps(dst, nds, src, Assembler::AVX_128bit, rscratch);
3379 }
3380
3381 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3382 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3383 assert(rscratch != noreg || always_reachable(src), "missing");
3384
3385 vxorpd(dst, nds, src, Assembler::AVX_128bit, rscratch);
3386 }
3387
3388 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3389 assert(rscratch != noreg || always_reachable(src), "missing");
3390
3391 if (reachable(src)) {
3392 vxorpd(dst, nds, as_Address(src), vector_len);
3393 } else {
3394 lea(rscratch, src);
3395 vxorpd(dst, nds, Address(rscratch, 0), vector_len);
3396 }
3397 }
3398
3399 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3400 assert(rscratch != noreg || always_reachable(src), "missing");
3401
3402 if (reachable(src)) {
3403 vxorps(dst, nds, as_Address(src), vector_len);
3404 } else {
3405 lea(rscratch, src);
3406 vxorps(dst, nds, Address(rscratch, 0), vector_len);
3407 }
3408 }
3409
3410 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3411 assert(rscratch != noreg || always_reachable(src), "missing");
3412
3413 if (UseAVX > 1 || (vector_len < 1)) {
3414 if (reachable(src)) {
3415 Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3416 } else {
3417 lea(rscratch, src);
3418 Assembler::vpxor(dst, nds, Address(rscratch, 0), vector_len);
3419 }
3420 } else {
3421 MacroAssembler::vxorpd(dst, nds, src, vector_len, rscratch);
3422 }
3423 }
3424
3425 void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3426 assert(rscratch != noreg || always_reachable(src), "missing");
3427
3428 if (reachable(src)) {
3429 Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3430 } else {
3431 lea(rscratch, src);
3432 Assembler::vpermd(dst, nds, Address(rscratch, 0), vector_len);
3433 }
3434 }
3435
3436 void MacroAssembler::clear_jobject_tag(Register possibly_non_local) {
3437 const int32_t inverted_mask = ~static_cast<int32_t>(JNIHandles::tag_mask);
3438 STATIC_ASSERT(inverted_mask == -4); // otherwise check this code
3439 // The inverted mask is sign-extended
3440 andptr(possibly_non_local, inverted_mask);
3441 }
3442
3443 void MacroAssembler::resolve_jobject(Register value,
3444 Register tmp) {
3445 Register thread = r15_thread;
3446 assert_different_registers(value, thread, tmp);
3447 Label done, tagged, weak_tagged;
3448 testptr(value, value);
3449 jcc(Assembler::zero, done); // Use null as-is.
3450 testptr(value, JNIHandles::tag_mask); // Test for tag.
3451 jcc(Assembler::notZero, tagged);
3452
3453 // Resolve local handle
3454 access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp);
3455 verify_oop(value);
3456 jmp(done);
3457
3458 bind(tagged);
3459 testptr(value, JNIHandles::TypeTag::weak_global); // Test for weak tag.
3460 jcc(Assembler::notZero, weak_tagged);
3461
3462 // Resolve global handle
3463 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp);
3464 verify_oop(value);
3465 jmp(done);
3466
3467 bind(weak_tagged);
3468 // Resolve jweak.
3469 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3470 value, Address(value, -JNIHandles::TypeTag::weak_global), tmp);
3471 verify_oop(value);
3472
3473 bind(done);
3474 }
3475
3476 void MacroAssembler::resolve_global_jobject(Register value,
3477 Register tmp) {
3478 Register thread = r15_thread;
3479 assert_different_registers(value, thread, tmp);
3480 Label done;
3481
3482 testptr(value, value);
3483 jcc(Assembler::zero, done); // Use null as-is.
3484
3485 #ifdef ASSERT
3486 {
3487 Label valid_global_tag;
3488 testptr(value, JNIHandles::TypeTag::global); // Test for global tag.
3489 jcc(Assembler::notZero, valid_global_tag);
3490 stop("non global jobject using resolve_global_jobject");
3491 bind(valid_global_tag);
3492 }
3493 #endif
3494
3495 // Resolve global handle
3496 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp);
3497 verify_oop(value);
3498
3499 bind(done);
3500 }
3501
// Pointer-sized subtract of an immediate from dst; forwards to subq.
void MacroAssembler::subptr(Register dst, int32_t imm32) {
  subq(dst, imm32);
}
3505
// Force generation of a 4 byte immediate value even if it fits into 8bit
// (forwards to subq_imm32).
void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
  subq_imm32(dst, imm32);
}
3510
// Pointer-sized subtract of src from dst; forwards to subq.
void MacroAssembler::subptr(Register dst, Register src) {
  subq(dst, src);
}
3514
3515 // C++ bool manipulation
3516 void MacroAssembler::testbool(Register dst) {
3517 if(sizeof(bool) == 1)
3518 testb(dst, 0xff);
3519 else if(sizeof(bool) == 2) {
3520 // testw implementation needed for two byte bools
3521 ShouldNotReachHere();
3522 } else if(sizeof(bool) == 4)
3523 testl(dst, dst);
3524 else
3525 // unsupported
3526 ShouldNotReachHere();
3527 }
3528
// Pointer-sized test of dst against src; forwards to testq.
void MacroAssembler::testptr(Register dst, Register src) {
  testq(dst, src);
}
3532
3533 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3534 void MacroAssembler::tlab_allocate(Register obj,
3535 Register var_size_in_bytes,
3536 int con_size_in_bytes,
3537 Register t1,
3538 Register t2,
3539 Label& slow_case) {
3540 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3541 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3542 }
3543
3544 RegSet MacroAssembler::call_clobbered_gp_registers() {
3545 RegSet regs;
3546 regs += RegSet::of(rax, rcx, rdx);
3547 #ifndef _WINDOWS
3548 regs += RegSet::of(rsi, rdi);
3549 #endif
3550 regs += RegSet::range(r8, r11);
3551 if (UseAPX) {
3552 regs += RegSet::range(r16, as_Register(Register::number_of_registers - 1));
3553 }
3554 return regs;
3555 }
3556
3557 XMMRegSet MacroAssembler::call_clobbered_xmm_registers() {
3558 int num_xmm_registers = XMMRegister::available_xmm_registers();
3559 #if defined(_WINDOWS)
3560 XMMRegSet result = XMMRegSet::range(xmm0, xmm5);
3561 if (num_xmm_registers > 16) {
3562 result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1));
3563 }
3564 return result;
3565 #else
3566 return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1));
3567 #endif
3568 }
3569
// Number of bytes reserved per saved XMM register: the size of one double,
// because C1 only ever uses the first double/float of the XMM register.
static int xmm_save_size() {
  return static_cast<int>(sizeof(double));
}
3572
3573 static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
3574 masm->movdbl(Address(rsp, offset), reg);
3575 }
3576
3577 static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
3578 masm->movdbl(reg, Address(rsp, offset));
3579 }
3580
3581 static int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers,
3582 bool save_fpu, int& gp_area_size, int& xmm_area_size) {
3583
3584 gp_area_size = align_up(gp_registers.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size,
3585 StackAlignmentInBytes);
3586 xmm_area_size = save_fpu ? xmm_registers.size() * xmm_save_size() : 0;
3587
3588 return gp_area_size + xmm_area_size;
3589 }
3590
3591 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) {
3592 block_comment("push_call_clobbered_registers start");
3593 // Regular registers
3594 RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude;
3595
3596 int gp_area_size;
3597 int xmm_area_size;
3598 int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu,
3599 gp_area_size, xmm_area_size);
3600 subptr(rsp, total_save_size);
3601
3602 push_set(gp_registers_to_push, 0);
3603
3604 if (save_fpu) {
3605 push_set(call_clobbered_xmm_registers(), gp_area_size);
3606 }
3607
3608 block_comment("push_call_clobbered_registers end");
3609 }
3610
3611 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) {
3612 block_comment("pop_call_clobbered_registers start");
3613
3614 RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude;
3615
3616 int gp_area_size;
3617 int xmm_area_size;
3618 int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu,
3619 gp_area_size, xmm_area_size);
3620
3621 if (restore_fpu) {
3622 pop_set(call_clobbered_xmm_registers(), gp_area_size);
3623 }
3624
3625 pop_set(gp_registers_to_pop, 0);
3626
3627 addptr(rsp, total_save_size);
3628
3629 vzeroupper();
3630
3631 block_comment("pop_call_clobbered_registers end");
3632 }
3633
3634 void MacroAssembler::push_set(XMMRegSet set, int offset) {
3635 assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be");
3636 int spill_offset = offset;
3637
3638 for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) {
3639 save_xmm_register(this, spill_offset, *it);
3640 spill_offset += xmm_save_size();
3641 }
3642 }
3643
3644 void MacroAssembler::pop_set(XMMRegSet set, int offset) {
3645 int restore_size = set.size() * xmm_save_size();
3646 assert(is_aligned(restore_size, StackAlignmentInBytes), "must be");
3647
3648 int restore_offset = offset + restore_size - xmm_save_size();
3649
3650 for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) {
3651 restore_xmm_register(this, restore_offset, *it);
3652 restore_offset -= xmm_save_size();
3653 }
3654 }
3655
3656 void MacroAssembler::push_set(RegSet set, int offset) {
3657 int spill_offset;
3658 if (offset == -1) {
3659 int register_push_size = set.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3660 int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
3661 subptr(rsp, aligned_size);
3662 spill_offset = 0;
3663 } else {
3664 spill_offset = offset;
3665 }
3666
3667 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
3668 movptr(Address(rsp, spill_offset), *it);
3669 spill_offset += Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3670 }
3671 }
3672
3673 void MacroAssembler::pop_set(RegSet set, int offset) {
3674
3675 int gp_reg_size = Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3676 int restore_size = set.size() * gp_reg_size;
3677 int aligned_size = align_up(restore_size, StackAlignmentInBytes);
3678
3679 int restore_offset;
3680 if (offset == -1) {
3681 restore_offset = restore_size - gp_reg_size;
3682 } else {
3683 restore_offset = offset + restore_size - gp_reg_size;
3684 }
3685 for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) {
3686 movptr(*it, Address(rsp, restore_offset));
3687 restore_offset -= gp_reg_size;
3688 }
3689
3690 if (offset == -1) {
3691 addptr(rsp, aligned_size);
3692 }
3693 }
3694
3695 // Preserves the contents of address, destroys the contents length_in_bytes and temp.
3696 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
3697 assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
3698 assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
3699 Label done;
3700
3701 testptr(length_in_bytes, length_in_bytes);
3702 jcc(Assembler::zero, done);
3703
3704 // initialize topmost word, divide index by 2, check if odd and test if zero
3705 // note: for the remaining code to work, index must be a multiple of BytesPerWord
3706 #ifdef ASSERT
3707 {
3708 Label L;
3709 testptr(length_in_bytes, BytesPerWord - 1);
3710 jcc(Assembler::zero, L);
3711 stop("length must be a multiple of BytesPerWord");
3712 bind(L);
3713 }
3714 #endif
3715 Register index = length_in_bytes;
3716 xorptr(temp, temp); // use _zero reg to clear memory (shorter code)
3717 if (UseIncDec) {
3718 shrptr(index, 3); // divide by 8/16 and set carry flag if bit 2 was set
3719 } else {
3720 shrptr(index, 2); // use 2 instructions to avoid partial flag stall
3721 shrptr(index, 1);
3722 }
3723
3724 // initialize remaining object fields: index is a multiple of 2 now
3725 {
3726 Label loop;
3727 bind(loop);
3728 movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
3729 decrement(index);
3730 jcc(Assembler::notZero, loop);
3731 }
3732
3733 bind(done);
3734 }
3735
3736 // Look up the method for a megamorphic invokeinterface call.
3737 // The target method is determined by <intf_klass, itable_index>.
3738 // The receiver klass is in recv_klass.
3739 // On success, the result will be in method_result, and execution falls through.
3740 // On failure, execution transfers to the given label.
3741 void MacroAssembler::lookup_interface_method(Register recv_klass,
3742 Register intf_klass,
3743 RegisterOrConstant itable_index,
3744 Register method_result,
3745 Register scan_temp,
3746 Label& L_no_such_interface,
3747 bool return_method) {
3748 assert_different_registers(recv_klass, intf_klass, scan_temp);
3749 assert_different_registers(method_result, intf_klass, scan_temp);
3750 assert(recv_klass != method_result || !return_method,
3751 "recv_klass can be destroyed when method isn't needed");
3752
3753 assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3754 "caller must use same register for non-constant itable index as for method");
3755
3756 // Compute start of first itableOffsetEntry (which is at the end of the vtable)
3757 int vtable_base = in_bytes(Klass::vtable_start_offset());
3758 int itentry_off = in_bytes(itableMethodEntry::method_offset());
3759 int scan_step = itableOffsetEntry::size() * wordSize;
3760 int vte_size = vtableEntry::size_in_bytes();
3761 Address::ScaleFactor times_vte_scale = Address::times_ptr;
3762 assert(vte_size == wordSize, "else adjust times_vte_scale");
3763
3764 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3765
3766 // Could store the aligned, prescaled offset in the klass.
3767 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
3768
3769 if (return_method) {
3770 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3771 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3772 lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
3773 }
3774
3775 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
3776 // if (scan->interface() == intf) {
3777 // result = (klass + scan->offset() + itable_index);
3778 // }
3779 // }
3780 Label search, found_method;
3781
3782 for (int peel = 1; peel >= 0; peel--) {
3783 movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset()));
3784 cmpptr(intf_klass, method_result);
3785
3786 if (peel) {
3787 jccb(Assembler::equal, found_method);
3788 } else {
3789 jccb(Assembler::notEqual, search);
3790 // (invert the test to fall through to found_method...)
3791 }
3792
3793 if (!peel) break;
3794
3795 bind(search);
3796
3797 // Check that the previous entry is non-null. A null entry means that
3798 // the receiver class doesn't implement the interface, and wasn't the
3799 // same as when the caller was compiled.
3800 testptr(method_result, method_result);
3801 jcc(Assembler::zero, L_no_such_interface);
3802 addptr(scan_temp, scan_step);
3803 }
3804
3805 bind(found_method);
3806
3807 if (return_method) {
3808 // Got a hit.
3809 movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset()));
3810 movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
3811 }
3812 }
3813
3814 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
3815 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
3816 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
3817 // The target method is determined by <holder_klass, itable_index>.
3818 // The receiver klass is in recv_klass.
3819 // On success, the result will be in method_result, and execution falls through.
3820 // On failure, execution transfers to the given label.
3821 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
3822 Register holder_klass,
3823 Register resolved_klass,
3824 Register method_result,
3825 Register scan_temp,
3826 Register temp_reg2,
3827 Register receiver,
3828 int itable_index,
3829 Label& L_no_such_interface) {
3830 assert_different_registers(recv_klass, method_result, holder_klass, resolved_klass, scan_temp, temp_reg2, receiver);
3831 Register temp_itbl_klass = method_result;
3832 Register temp_reg = (temp_reg2 == noreg ? recv_klass : temp_reg2); // reuse recv_klass register on 32-bit x86 impl
3833
3834 int vtable_base = in_bytes(Klass::vtable_start_offset());
3835 int itentry_off = in_bytes(itableMethodEntry::method_offset());
3836 int scan_step = itableOffsetEntry::size() * wordSize;
3837 int vte_size = vtableEntry::size_in_bytes();
3838 int ioffset = in_bytes(itableOffsetEntry::interface_offset());
3839 int ooffset = in_bytes(itableOffsetEntry::offset_offset());
3840 Address::ScaleFactor times_vte_scale = Address::times_ptr;
3841 assert(vte_size == wordSize, "adjust times_vte_scale");
3842
3843 Label L_loop_scan_resolved_entry, L_resolved_found, L_holder_found;
3844
3845 // temp_itbl_klass = recv_klass.itable[0]
3846 // scan_temp = &recv_klass.itable[0] + step
3847 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3848 movptr(temp_itbl_klass, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset));
3849 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset + scan_step));
3850 xorptr(temp_reg, temp_reg);
3851
3852 // Initial checks:
3853 // - if (holder_klass != resolved_klass), go to "scan for resolved"
3854 // - if (itable[0] == 0), no such interface
3855 // - if (itable[0] == holder_klass), shortcut to "holder found"
3856 cmpptr(holder_klass, resolved_klass);
3857 jccb(Assembler::notEqual, L_loop_scan_resolved_entry);
3858 testptr(temp_itbl_klass, temp_itbl_klass);
3859 jccb(Assembler::zero, L_no_such_interface);
3860 cmpptr(holder_klass, temp_itbl_klass);
3861 jccb(Assembler::equal, L_holder_found);
3862
3863 // Loop: Look for holder_klass record in itable
3864 // do {
3865 // tmp = itable[index];
3866 // index += step;
3867 // if (tmp == holder_klass) {
3868 // goto L_holder_found; // Found!
3869 // }
3870 // } while (tmp != 0);
3871 // goto L_no_such_interface // Not found.
3872 Label L_scan_holder;
3873 bind(L_scan_holder);
3874 movptr(temp_itbl_klass, Address(scan_temp, 0));
3875 addptr(scan_temp, scan_step);
3876 cmpptr(holder_klass, temp_itbl_klass);
3877 jccb(Assembler::equal, L_holder_found);
3878 testptr(temp_itbl_klass, temp_itbl_klass);
3879 jccb(Assembler::notZero, L_scan_holder);
3880
3881 jmpb(L_no_such_interface);
3882
3883 // Loop: Look for resolved_class record in itable
3884 // do {
3885 // tmp = itable[index];
3886 // index += step;
3887 // if (tmp == holder_klass) {
3888 // // Also check if we have met a holder klass
3889 // holder_tmp = itable[index-step-ioffset];
3890 // }
3891 // if (tmp == resolved_klass) {
3892 // goto L_resolved_found; // Found!
3893 // }
3894 // } while (tmp != 0);
3895 // goto L_no_such_interface // Not found.
3896 //
3897 Label L_loop_scan_resolved;
3898 bind(L_loop_scan_resolved);
3899 movptr(temp_itbl_klass, Address(scan_temp, 0));
3900 addptr(scan_temp, scan_step);
3901 bind(L_loop_scan_resolved_entry);
3902 cmpptr(holder_klass, temp_itbl_klass);
3903 cmovl(Assembler::equal, temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
3904 cmpptr(resolved_klass, temp_itbl_klass);
3905 jccb(Assembler::equal, L_resolved_found);
3906 testptr(temp_itbl_klass, temp_itbl_klass);
3907 jccb(Assembler::notZero, L_loop_scan_resolved);
3908
3909 jmpb(L_no_such_interface);
3910
3911 Label L_ready;
3912
3913 // See if we already have a holder klass. If not, go and scan for it.
3914 bind(L_resolved_found);
3915 testptr(temp_reg, temp_reg);
3916 jccb(Assembler::zero, L_scan_holder);
3917 jmpb(L_ready);
3918
3919 bind(L_holder_found);
3920 movl(temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
3921
3922 // Finally, temp_reg contains holder_klass vtable offset
3923 bind(L_ready);
3924 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3925 if (temp_reg2 == noreg) { // recv_klass register is clobbered for 32-bit x86 impl
3926 load_klass(scan_temp, receiver, noreg);
3927 movptr(method_result, Address(scan_temp, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
3928 } else {
3929 movptr(method_result, Address(recv_klass, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
3930 }
3931 }
3932
3933
3934 // virtual method calling
3935 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3936 RegisterOrConstant vtable_index,
3937 Register method_result) {
3938 const ByteSize base = Klass::vtable_start_offset();
3939 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
3940 Address vtable_entry_addr(recv_klass,
3941 vtable_index, Address::times_ptr,
3942 base + vtableEntry::method_offset());
3943 movptr(method_result, vtable_entry_addr);
3944 }
3945
3946
3947 void MacroAssembler::check_klass_subtype(Register sub_klass,
3948 Register super_klass,
3949 Register temp_reg,
3950 Label& L_success) {
3951 Label L_failure;
3952 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, nullptr);
3953 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, nullptr);
3954 bind(L_failure);
3955 }
3956
3957
3958 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3959 Register super_klass,
3960 Register temp_reg,
3961 Label* L_success,
3962 Label* L_failure,
3963 Label* L_slow_path,
3964 RegisterOrConstant super_check_offset) {
3965 assert_different_registers(sub_klass, super_klass, temp_reg);
3966 bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
3967 if (super_check_offset.is_register()) {
3968 assert_different_registers(sub_klass, super_klass,
3969 super_check_offset.as_register());
3970 } else if (must_load_sco) {
3971 assert(temp_reg != noreg, "supply either a temp or a register offset");
3972 }
3973
3974 Label L_fallthrough;
3975 int label_nulls = 0;
3976 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
3977 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
3978 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
3979 assert(label_nulls <= 1, "at most one null in the batch");
3980
3981 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3982 int sco_offset = in_bytes(Klass::super_check_offset_offset());
3983 Address super_check_offset_addr(super_klass, sco_offset);
3984
3985 // Hacked jcc, which "knows" that L_fallthrough, at least, is in
3986 // range of a jccb. If this routine grows larger, reconsider at
3987 // least some of these.
3988 #define local_jcc(assembler_cond, label) \
3989 if (&(label) == &L_fallthrough) jccb(assembler_cond, label); \
3990 else jcc( assembler_cond, label) /*omit semi*/
3991
3992 // Hacked jmp, which may only be used just before L_fallthrough.
3993 #define final_jmp(label) \
3994 if (&(label) == &L_fallthrough) { /*do nothing*/ } \
3995 else jmp(label) /*omit semi*/
3996
3997 // If the pointers are equal, we are done (e.g., String[] elements).
3998 // This self-check enables sharing of secondary supertype arrays among
3999 // non-primary types such as array-of-interface. Otherwise, each such
4000 // type would need its own customized SSA.
4001 // We move this check to the front of the fast path because many
4002 // type checks are in fact trivially successful in this manner,
4003 // so we get a nicely predicted branch right at the start of the check.
4004 cmpptr(sub_klass, super_klass);
4005 local_jcc(Assembler::equal, *L_success);
4006
4007 // Check the supertype display:
4008 if (must_load_sco) {
4009 // Positive movl does right thing on LP64.
4010 movl(temp_reg, super_check_offset_addr);
4011 super_check_offset = RegisterOrConstant(temp_reg);
4012 }
4013 Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4014 cmpptr(super_klass, super_check_addr); // load displayed supertype
4015
4016 // This check has worked decisively for primary supers.
4017 // Secondary supers are sought in the super_cache ('super_cache_addr').
4018 // (Secondary supers are interfaces and very deeply nested subtypes.)
4019 // This works in the same check above because of a tricky aliasing
4020 // between the super_cache and the primary super display elements.
4021 // (The 'super_check_addr' can address either, as the case requires.)
4022 // Note that the cache is updated below if it does not help us find
4023 // what we need immediately.
4024 // So if it was a primary super, we can just fail immediately.
4025 // Otherwise, it's the slow path for us (no success at this point).
4026
4027 if (super_check_offset.is_register()) {
4028 local_jcc(Assembler::equal, *L_success);
4029 cmpl(super_check_offset.as_register(), sc_offset);
4030 if (L_failure == &L_fallthrough) {
4031 local_jcc(Assembler::equal, *L_slow_path);
4032 } else {
4033 local_jcc(Assembler::notEqual, *L_failure);
4034 final_jmp(*L_slow_path);
4035 }
4036 } else if (super_check_offset.as_constant() == sc_offset) {
4037 // Need a slow path; fast failure is impossible.
4038 if (L_slow_path == &L_fallthrough) {
4039 local_jcc(Assembler::equal, *L_success);
4040 } else {
4041 local_jcc(Assembler::notEqual, *L_slow_path);
4042 final_jmp(*L_success);
4043 }
4044 } else {
4045 // No slow path; it's a fast decision.
4046 if (L_failure == &L_fallthrough) {
4047 local_jcc(Assembler::equal, *L_success);
4048 } else {
4049 local_jcc(Assembler::notEqual, *L_failure);
4050 final_jmp(*L_success);
4051 }
4052 }
4053
4054 bind(L_fallthrough);
4055
4056 #undef local_jcc
4057 #undef final_jmp
4058 }
4059
4060
4061 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
4062 Register super_klass,
4063 Register temp_reg,
4064 Register temp2_reg,
4065 Label* L_success,
4066 Label* L_failure,
4067 bool set_cond_codes) {
4068 assert_different_registers(sub_klass, super_klass, temp_reg);
4069 if (temp2_reg != noreg)
4070 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4071 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4072
4073 Label L_fallthrough;
4074 int label_nulls = 0;
4075 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4076 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4077 assert(label_nulls <= 1, "at most one null in the batch");
4078
4079 // a couple of useful fields in sub_klass:
4080 int ss_offset = in_bytes(Klass::secondary_supers_offset());
4081 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4082 Address secondary_supers_addr(sub_klass, ss_offset);
4083 Address super_cache_addr( sub_klass, sc_offset);
4084
4085 // Do a linear scan of the secondary super-klass chain.
4086 // This code is rarely used, so simplicity is a virtue here.
4087 // The repne_scan instruction uses fixed registers, which we must spill.
4088 // Don't worry too much about pre-existing connections with the input regs.
4089
4090 assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4091 assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4092
4093 // Get super_klass value into rax (even if it was in rdi or rcx).
4094 bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4095 if (super_klass != rax) {
4096 if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4097 mov(rax, super_klass);
4098 }
4099 if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4100 if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4101
4102 #ifndef PRODUCT
4103 uint* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4104 ExternalAddress pst_counter_addr((address) pst_counter);
4105 lea(rcx, pst_counter_addr);
4106 incrementl(Address(rcx, 0));
4107 #endif //PRODUCT
4108
4109 // We will consult the secondary-super array.
4110 movptr(rdi, secondary_supers_addr);
4111 // Load the array length. (Positive movl does right thing on LP64.)
4112 movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4113 // Skip to start of data.
4114 addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4115
4116 // Scan RCX words at [RDI] for an occurrence of RAX.
4117 // Set NZ/Z based on last compare.
4118 // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
4119 // not change flags (only scas instruction which is repeated sets flags).
4120 // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
4121
4122 testptr(rax,rax); // Set Z = 0
4123 repne_scan();
4124
4125 // Unspill the temp. registers:
4126 if (pushed_rdi) pop(rdi);
4127 if (pushed_rcx) pop(rcx);
4128 if (pushed_rax) pop(rax);
4129
4130 if (set_cond_codes) {
4131 // Special hack for the AD files: rdi is guaranteed non-zero.
4132 assert(!pushed_rdi, "rdi must be left non-null");
4133 // Also, the condition codes are properly set Z/NZ on succeed/failure.
4134 }
4135
4136 if (L_failure == &L_fallthrough)
4137 jccb(Assembler::notEqual, *L_failure);
4138 else jcc(Assembler::notEqual, *L_failure);
4139
4140 // Success. Cache the super we found and proceed in triumph.
4141 movptr(super_cache_addr, super_klass);
4142
4143 if (L_success != &L_fallthrough) {
4144 jmp(*L_success);
4145 }
4146
4147 #undef IS_A_TEMP
4148
4149 bind(L_fallthrough);
4150 }
4151
4152 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4153 Register super_klass,
4154 Register temp_reg,
4155 Register temp2_reg,
4156 Label* L_success,
4157 Label* L_failure,
4158 bool set_cond_codes) {
4159 assert(set_cond_codes == false, "must be false on 64-bit x86");
4160 check_klass_subtype_slow_path
4161 (sub_klass, super_klass, temp_reg, temp2_reg, noreg, noreg,
4162 L_success, L_failure);
4163 }
4164
4165 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4166 Register super_klass,
4167 Register temp_reg,
4168 Register temp2_reg,
4169 Register temp3_reg,
4170 Register temp4_reg,
4171 Label* L_success,
4172 Label* L_failure) {
4173 if (UseSecondarySupersTable) {
4174 check_klass_subtype_slow_path_table
4175 (sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, temp4_reg,
4176 L_success, L_failure);
4177 } else {
4178 check_klass_subtype_slow_path_linear
4179 (sub_klass, super_klass, temp_reg, temp2_reg, L_success, L_failure, /*set_cond_codes*/false);
4180 }
4181 }
4182
4183 Register MacroAssembler::allocate_if_noreg(Register r,
4184 RegSetIterator<Register> &available_regs,
4185 RegSet ®s_to_push) {
4186 if (!r->is_valid()) {
4187 r = *available_regs++;
4188 regs_to_push += r;
4189 }
4190 return r;
4191 }
4192
4193 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
4194 Register super_klass,
4195 Register temp_reg,
4196 Register temp2_reg,
4197 Register temp3_reg,
4198 Register result_reg,
4199 Label* L_success,
4200 Label* L_failure) {
4201 // NB! Callers may assume that, when temp2_reg is a valid register,
4202 // this code sets it to a nonzero value.
4203 bool temp2_reg_was_valid = temp2_reg->is_valid();
4204
4205 RegSet temps = RegSet::of(temp_reg, temp2_reg, temp3_reg);
4206
4207 Label L_fallthrough;
4208 int label_nulls = 0;
4209 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4210 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4211 assert(label_nulls <= 1, "at most one null in the batch");
4212
4213 BLOCK_COMMENT("check_klass_subtype_slow_path_table");
4214
4215 RegSetIterator<Register> available_regs
4216 = (RegSet::of(rax, rcx, rdx, r8) + r9 + r10 + r11 + r12 - temps - sub_klass - super_klass).begin();
4217
4218 RegSet pushed_regs;
4219
4220 temp_reg = allocate_if_noreg(temp_reg, available_regs, pushed_regs);
4221 temp2_reg = allocate_if_noreg(temp2_reg, available_regs, pushed_regs);
4222 temp3_reg = allocate_if_noreg(temp3_reg, available_regs, pushed_regs);
4223 result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
4224 Register temp4_reg = allocate_if_noreg(noreg, available_regs, pushed_regs);
4225
4226 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, result_reg);
4227
4228 {
4229
4230 int register_push_size = pushed_regs.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
4231 int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
4232 subptr(rsp, aligned_size);
4233 push_set(pushed_regs, 0);
4234
4235 lookup_secondary_supers_table_var(sub_klass,
4236 super_klass,
4237 temp_reg, temp2_reg, temp3_reg, temp4_reg, result_reg);
4238 cmpq(result_reg, 0);
4239
4240 // Unspill the temp. registers:
4241 pop_set(pushed_regs, 0);
4242 // Increment SP but do not clobber flags.
4243 lea(rsp, Address(rsp, aligned_size));
4244 }
4245
4246 if (temp2_reg_was_valid) {
4247 movq(temp2_reg, 1);
4248 }
4249
4250 jcc(Assembler::notEqual, *L_failure);
4251
4252 if (L_success != &L_fallthrough) {
4253 jmp(*L_success);
4254 }
4255
4256 bind(L_fallthrough);
4257 }
4258
4259 // population_count variant for running without the POPCNT
4260 // instruction, which was introduced with SSE4.2 in 2008.
4261 void MacroAssembler::population_count(Register dst, Register src,
4262 Register scratch1, Register scratch2) {
4263 assert_different_registers(src, scratch1, scratch2);
4264 if (UsePopCountInstruction) {
4265 Assembler::popcntq(dst, src);
4266 } else {
4267 assert_different_registers(src, scratch1, scratch2);
4268 assert_different_registers(dst, scratch1, scratch2);
4269 Label loop, done;
4270
4271 mov(scratch1, src);
4272 // dst = 0;
4273 // while(scratch1 != 0) {
4274 // dst++;
4275 // scratch1 &= (scratch1 - 1);
4276 // }
4277 xorl(dst, dst);
4278 testq(scratch1, scratch1);
4279 jccb(Assembler::equal, done);
4280 {
4281 bind(loop);
4282 incq(dst);
4283 movq(scratch2, scratch1);
4284 decq(scratch2);
4285 andq(scratch1, scratch2);
4286 jccb(Assembler::notEqual, loop);
4287 }
4288 bind(done);
4289 }
4290 #ifdef ASSERT
4291 mov64(scratch1, 0xCafeBabeDeadBeef);
4292 movq(scratch2, scratch1);
4293 #endif
4294 }
4295
// The inline lookup sequence and the slow-path stub must agree on which
// registers carry which values. Expanded inside a function that names its
// registers r_super_klass, r_array_base, r_array_length, r_array_index,
// r_sub_klass, r_bitmap and result, this macro asserts the expected
// assignment; the last three may alternatively be noreg.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS                      \
do {                                                                 \
  assert(r_super_klass  == rax, "mismatch");                         \
  assert(r_array_base   == rbx, "mismatch");                         \
  assert(r_array_length == rcx, "mismatch");                         \
  assert(r_array_index  == rdx, "mismatch");                         \
  assert(r_sub_klass    == rsi || r_sub_klass == noreg, "mismatch"); \
  assert(r_bitmap       == r11 || r_bitmap    == noreg, "mismatch"); \
  assert(result         == rdi || result      == noreg, "mismatch"); \
} while (false)
4307
4308 // Versions of salq and rorq that don't need count to be in rcx
4309
4310 void MacroAssembler::salq(Register dest, Register count) {
4311 if (count == rcx) {
4312 Assembler::salq(dest);
4313 } else {
4314 assert_different_registers(rcx, dest);
4315 xchgq(rcx, count);
4316 Assembler::salq(dest);
4317 xchgq(rcx, count);
4318 }
4319 }
4320
4321 void MacroAssembler::rorq(Register dest, Register count) {
4322 if (count == rcx) {
4323 Assembler::rorq(dest);
4324 } else {
4325 assert_different_registers(rcx, dest);
4326 xchgq(rcx, count);
4327 Assembler::rorq(dest);
4328 xchgq(rcx, count);
4329 }
4330 }
4331
4332 // Return true: we succeeded in generating this code
4333 //
4334 // At runtime, return 0 in result if r_super_klass is a superclass of
4335 // r_sub_klass, otherwise return nonzero. Use this if you know the
4336 // super_klass_slot of the class you're looking for. This is always
4337 // the case for instanceof and checkcast.
4338 void MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
4339 Register r_super_klass,
4340 Register temp1,
4341 Register temp2,
4342 Register temp3,
4343 Register temp4,
4344 Register result,
4345 u1 super_klass_slot) {
4346 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
4347
4348 Label L_fallthrough, L_success, L_failure;
4349
4350 BLOCK_COMMENT("lookup_secondary_supers_table {");
4351
4352 const Register
4353 r_array_index = temp1,
4354 r_array_length = temp2,
4355 r_array_base = temp3,
4356 r_bitmap = temp4;
4357
4358 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;
4359
4360 xorq(result, result); // = 0
4361
4362 movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4363 movq(r_array_index, r_bitmap);
4364
4365 // First check the bitmap to see if super_klass might be present. If
4366 // the bit is zero, we are certain that super_klass is not one of
4367 // the secondary supers.
4368 u1 bit = super_klass_slot;
4369 {
4370 // NB: If the count in a x86 shift instruction is 0, the flags are
4371 // not affected, so we do a testq instead.
4372 int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
4373 if (shift_count != 0) {
4374 salq(r_array_index, shift_count);
4375 } else {
4376 testq(r_array_index, r_array_index);
4377 }
4378 }
4379 // We test the MSB of r_array_index, i.e. its sign bit
4380 jcc(Assembler::positive, L_failure);
4381
4382 // Get the first array index that can contain super_klass into r_array_index.
4383 if (bit != 0) {
4384 population_count(r_array_index, r_array_index, temp2, temp3);
4385 } else {
4386 movl(r_array_index, 1);
4387 }
4388 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
4389
4390 // We will consult the secondary-super array.
4391 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4392
4393 // We're asserting that the first word in an Array<Klass*> is the
4394 // length, and the second word is the first word of the data. If
4395 // that ever changes, r_array_base will have to be adjusted here.
4396 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4397 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4398
4399 cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
4400 jccb(Assembler::equal, L_success);
4401
4402 // Is there another entry to check? Consult the bitmap.
4403 btq(r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
4404 jccb(Assembler::carryClear, L_failure);
4405
4406 // Linear probe. Rotate the bitmap so that the next bit to test is
4407 // in Bit 1.
4408 if (bit != 0) {
4409 rorq(r_bitmap, bit);
4410 }
4411
4412 // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
4413 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
4414 // Kills: r_array_length.
4415 // Returns: result.
4416 call(RuntimeAddress(StubRoutines::lookup_secondary_supers_table_slow_path_stub()));
4417 // Result (0/1) is in rdi
4418 jmpb(L_fallthrough);
4419
4420 bind(L_failure);
4421 incq(result); // 0 => 1
4422
4423 bind(L_success);
4424 // result = 0;
4425
4426 bind(L_fallthrough);
4427 BLOCK_COMMENT("} lookup_secondary_supers_table");
4428
4429 if (VerifySecondarySupers) {
4430 verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
4431 temp1, temp2, temp3);
4432 }
4433 }
4434
// At runtime, return 0 in result if r_super_klass is a superclass of
// r_sub_klass, otherwise return nonzero. Use this version of
// lookup_secondary_supers_table() if you don't know ahead of time
// which superclass will be searched for. Used by interpreter and
// runtime stubs. It is larger and has somewhat greater latency than
// the version above, which takes a constant super_klass_slot.
//
// Parameters:
//   r_sub_klass   - klass whose secondary supers are searched.
//   r_super_klass - klass being looked for.
//   temp1..temp4  - scratch registers; rcx is expected to be one of them
//                   (asserted below) so that it can hold the shift count.
//   result        - set to 0 on a hit and to 1 on a miss. It is also handed
//                   to the slow path as a scratch register.
//
// All seven registers must be pairwise distinct, and neither klass register
// may be rcx.
void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
                                                       Register r_super_klass,
                                                       Register temp1,
                                                       Register temp2,
                                                       Register temp3,
                                                       Register temp4,
                                                       Register result) {
  assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
  assert_different_registers(r_sub_klass, r_super_klass, rcx);
  RegSet temps = RegSet::of(temp1, temp2, temp3, temp4);

  Label L_fallthrough, L_success, L_failure;

  BLOCK_COMMENT("lookup_secondary_supers_table {");

  // rcx is kept out of the pool the remaining temporaries are drawn from,
  // because it is reserved for the shift count (see below).
  RegSetIterator<Register> available_regs = (temps - rcx).begin();

  // FIXME. Once we are sure that all paths reaching this point really
  // do pass rcx as one of our temps we can get rid of the following
  // workaround.
  assert(temps.contains(rcx), "fix this code");

  // We prefer to have our shift count in rcx. If rcx is one of our
  // temps, use it for slot. If not, pick any of our temps.
  Register slot;
  if (!temps.contains(rcx)) {
    slot = *available_regs++;
  } else {
    slot = rcx;
  }

  const Register r_array_index = *available_regs++;
  const Register r_bitmap      = *available_regs++;

  // The logic above guarantees this property, but we state it here.
  assert_different_registers(r_array_index, r_bitmap, rcx);

  // Keep one pristine copy of the bitmap in r_bitmap (needed later for the
  // linear probe) and a working copy in r_array_index.
  movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
  movq(r_array_index, r_bitmap);

  // First check the bitmap to see if super_klass might be present. If
  // the bit is zero, we are certain that super_klass is not one of
  // the secondary supers.
  movb(slot, Address(r_super_klass, Klass::hash_slot_offset()));
  xorl(slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1)); // slot ^ 63 === 63 - slot (mod 64)
  salq(r_array_index, slot);

  testq(r_array_index, r_array_index);
  // We test the MSB of r_array_index, i.e. its sign bit
  jcc(Assembler::positive, L_failure);

  // The last temporary is only claimed once the bitmap test has passed.
  const Register r_array_base = *available_regs++;

  // Get the first array index that can contain super_klass into r_array_index.
  // Note: Clobbers r_array_base and slot.
  population_count(r_array_index, r_array_index, /*temp2*/r_array_base, /*temp3*/slot);

  // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.

  // We will consult the secondary-super array.
  movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));

  // We're asserting that the first word in an Array<Klass*> is the
  // length, and the second word is the first word of the data. If
  // that ever changes, r_array_base will have to be adjusted here.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
  assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");

  cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
  jccb(Assembler::equal, L_success);

  // Restore slot to its true value
  // (it was clobbered by the xor above and by population_count).
  movb(slot, Address(r_super_klass, Klass::hash_slot_offset()));

  // Linear probe. Rotate the bitmap so that the next bit to test is
  // in Bit 1.
  rorq(r_bitmap, slot);

  // Is there another entry to check? Consult the bitmap.
  btq(r_bitmap, 1);
  jccb(Assembler::carryClear, L_failure);

  // Emit the collision-handling code inline.
  // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
  // Scratch:   result is passed as temp1 (the slow path keeps the array
  //            length there) and slot as temp2.
  // A hit branches to L_success. Passing nullptr as the failure label makes
  // a miss fall through to the L_failure code bound right below.
  lookup_secondary_supers_table_slow_path(r_super_klass,
                                          r_array_base,
                                          r_array_index,
                                          r_bitmap,
                                          /*temp1*/result,
                                          /*temp2*/slot,
                                          &L_success,
                                          nullptr);

  bind(L_failure);
  movq(result, 1); // miss
  jmpb(L_fallthrough);

  bind(L_success);
  xorq(result, result); // = 0

  bind(L_fallthrough);
  BLOCK_COMMENT("} lookup_secondary_supers_table");

  // Optionally cross-check the hashed answer against a plain linear scan.
  if (VerifySecondarySupers) {
    verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
                                  temp1, temp2, temp3);
  }
}
4551
4552 void MacroAssembler::repne_scanq(Register addr, Register value, Register count, Register limit,
4553 Label* L_success, Label* L_failure) {
4554 Label L_loop, L_fallthrough;
4555 {
4556 int label_nulls = 0;
4557 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4558 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4559 assert(label_nulls <= 1, "at most one null in the batch");
4560 }
4561 bind(L_loop);
4562 cmpq(value, Address(addr, count, Address::times_8));
4563 jcc(Assembler::equal, *L_success);
4564 addl(count, 1);
4565 cmpl(count, limit);
4566 jcc(Assembler::less, L_loop);
4567
4568 if (&L_fallthrough != L_failure) {
4569 jmp(*L_failure);
4570 }
4571 bind(L_fallthrough);
4572 }
4573
// Called by code generated by check_klass_subtype_slow_path
// above. This is called when there is a collision in the hashed
// lookup in the secondary supers array.
//
// Parameters:
//   r_super_klass - klass being looked for.
//   r_array_base  - on entry, address of the secondary supers array object
//                   (its length is read from it); advanced here to the data.
//   r_array_index - index of the next slot to inspect; may be out of bounds
//                   on entry, in which case it wraps to 0.
//   r_bitmap      - bitmap rotated by the caller so that the bits already
//                   tested are bits 0 and 1.
//   temp1         - scratch; holds the array length.
//   temp2         - scratch; holds zero for the wraparound cmov.
//   L_success     - taken on a hit.
//   L_failure     - taken on a miss.
// At most one label may be null; a null label means fall through.
// All six registers must be distinct and all of them except r_super_klass
// are modified.
void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
                                                             Register r_array_base,
                                                             Register r_array_index,
                                                             Register r_bitmap,
                                                             Register temp1,
                                                             Register temp2,
                                                             Label* L_success,
                                                             Label* L_failure) {
  assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, temp2);

  // NOTE(review): r_sub_klass and result are not referenced anywhere in this
  // function; presumably leftovers from a register-checking macro - confirm
  // before removing.
  const Register
    r_array_length = temp1,
    r_sub_klass    = noreg,
    result         = noreg;

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  // Load the array length.
  movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
  // And adjust the array base to point to the data.
  // NB! Effectively increments current slot index by 1.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
  addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());

  // Linear probe
  Label L_huge;

  // The bitmap is full to bursting.
  // Implicit invariant: BITMAP_FULL implies (length > 0)
  cmpl(r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
  jcc(Assembler::greater, L_huge);

  // NB! Our caller has checked bits 0 and 1 in the bitmap. The
  // current slot (at secondary_supers[r_array_index]) has not yet
  // been inspected, and r_array_index may be out of bounds if we
  // wrapped around the end of the array.

  { // This is conventional linear probing, but instead of terminating
    // when a null entry is found in the table, we maintain a bitmap
    // in which a 0 indicates missing entries.
    // The check above guarantees there are 0s in the bitmap, so the loop
    // eventually terminates.

    xorl(temp2, temp2); // = 0;

    Label L_again;
    bind(L_again);

    // Check for array wraparound.
    // (index >= length resets the index to the zero held in temp2)
    cmpl(r_array_index, r_array_length);
    cmovl(Assembler::greaterEqual, r_array_index, temp2);

    cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
    jcc(Assembler::equal, *L_success);

    // If the next bit in bitmap is zero, we're done.
    btq(r_bitmap, 2); // look-ahead check (Bit 2); Bits 0 and 1 are tested by now
    jcc(Assembler::carryClear, *L_failure);

    rorq(r_bitmap, 1); // Bits 1/2 => 0/1
    addl(r_array_index, 1);

    jmp(L_again);
  }

  { // Degenerate case: more than 64 secondary supers.
    // FIXME: We could do something smarter here, maybe a vectorized
    // comparison or a binary search, but is that worth any added
    // complexity?
    bind(L_huge);
    // Scan the whole array from element 0. The failure label is only
    // forwarded when it is a real target; otherwise the scan falls through.
    xorl(r_array_index, r_array_index); // = 0
    repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length,
                L_success,
                (&L_fallthrough != L_failure ? L_failure : nullptr));

    bind(L_fallthrough);
  }
}
4659
// Argument record handed to verify_secondary_supers_table_helper().
// verify_secondary_supers_table() builds it on the stack by pushing, in this
// order, result, linear_result, r_sub_klass and r_super_klass, and then passes
// rsp as the record pointer. The field order below therefore has to stay the
// reverse of that push order.
struct VerifyHelperArguments {
  Klass* _super;            // pushed last
  Klass* _sub;
  intptr_t _linear_result;  // outcome of the linear scan
  intptr_t _table_result;   // outcome of the hashed lookup being verified
};
4666
4667 static void verify_secondary_supers_table_helper(const char* msg, VerifyHelperArguments* args) {
4668 Klass::on_secondary_supers_verification_failure(args->_super,
4669 args->_sub,
4670 args->_linear_result,
4671 args->_table_result,
4672 msg);
4673 }
4674
4675 // Make sure that the hashed lookup and a linear scan agree.
4676 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
4677 Register r_super_klass,
4678 Register result,
4679 Register temp1,
4680 Register temp2,
4681 Register temp3) {
4682 const Register
4683 r_array_index = temp1,
4684 r_array_length = temp2,
4685 r_array_base = temp3,
4686 r_bitmap = noreg;
4687
4688 BLOCK_COMMENT("verify_secondary_supers_table {");
4689
4690 Label L_success, L_failure, L_check, L_done;
4691
4692 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4693 movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4694 // And adjust the array base to point to the data.
4695 addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());
4696
4697 testl(r_array_length, r_array_length); // array_length == 0?
4698 jcc(Assembler::zero, L_failure);
4699
4700 movl(r_array_index, 0);
4701 repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length, &L_success);
4702 // fall through to L_failure
4703
4704 const Register linear_result = r_array_index; // reuse temp1
4705
4706 bind(L_failure); // not present
4707 movl(linear_result, 1);
4708 jmp(L_check);
4709
4710 bind(L_success); // present
4711 movl(linear_result, 0);
4712
4713 bind(L_check);
4714 cmpl(linear_result, result);
4715 jcc(Assembler::equal, L_done);
4716
4717 { // To avoid calling convention issues, build a record on the stack
4718 // and pass the pointer to that instead.
4719 push(result);
4720 push(linear_result);
4721 push(r_sub_klass);
4722 push(r_super_klass);
4723 movptr(c_rarg1, rsp);
4724 movptr(c_rarg0, (uintptr_t) "mismatch");
4725 call(RuntimeAddress(CAST_FROM_FN_PTR(address, verify_secondary_supers_table_helper)));
4726 should_not_reach_here();
4727 }
4728 bind(L_done);
4729
4730 BLOCK_COMMENT("} verify_secondary_supers_table");
4731 }
4732
4733 #undef LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS
4734
4735 void MacroAssembler::clinit_barrier(Register klass, Label* L_fast_path, Label* L_slow_path) {
4736 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
4737
4738 Label L_fallthrough;
4739 if (L_fast_path == nullptr) {
4740 L_fast_path = &L_fallthrough;
4741 } else if (L_slow_path == nullptr) {
4742 L_slow_path = &L_fallthrough;
4743 }
4744
4745 // Fast path check: class is fully initialized.
4746 // init_state needs acquire, but x86 is TSO, and so we are already good.
4747 cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4748 jcc(Assembler::equal, *L_fast_path);
4749
4750 // Fast path check: current thread is initializer thread
4751 cmpptr(r15_thread, Address(klass, InstanceKlass::init_thread_offset()));
4752 if (L_slow_path == &L_fallthrough) {
4753 jcc(Assembler::equal, *L_fast_path);
4754 bind(*L_slow_path);
4755 } else if (L_fast_path == &L_fallthrough) {
4756 jcc(Assembler::notEqual, *L_slow_path);
4757 bind(*L_fast_path);
4758 } else {
4759 Unimplemented();
4760 }
4761 }
4762
4763 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4764 if (VM_Version::supports_cmov()) {
4765 cmovl(cc, dst, src);
4766 } else {
4767 Label L;
4768 jccb(negate_condition(cc), L);
4769 movl(dst, src);
4770 bind(L);
4771 }
4772 }
4773
4774 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4775 if (VM_Version::supports_cmov()) {
4776 cmovl(cc, dst, src);
4777 } else {
4778 Label L;
4779 jccb(negate_condition(cc), L);
4780 movl(dst, src);
4781 bind(L);
4782 }
4783 }
4784
4785 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4786 if (!VerifyOops) return;
4787
4788 BLOCK_COMMENT("verify_oop {");
4789 push(rscratch1);
4790 push(rax); // save rax
4791 push(reg); // pass register argument
4792
4793 // Pass register number to verify_oop_subroutine
4794 const char* b = nullptr;
4795 {
4796 ResourceMark rm;
4797 stringStream ss;
4798 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4799 b = code_string(ss.as_string());
4800 }
4801 AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
4802 pushptr(buffer.addr(), rscratch1);
4803
4804 // call indirectly to solve generation ordering problem
4805 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4806 call(rax);
4807 // Caller pops the arguments (oop, message) and restores rax, r10
4808 BLOCK_COMMENT("} verify_oop");
4809 }
4810
4811 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
4812 if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4813 // Only pcmpeq has dependency breaking treatment (i.e the execution can begin without
4814 // waiting for the previous result on dst), not vpcmpeqd, so just use vpternlog
4815 vpternlogd(dst, 0xFF, dst, dst, vector_len);
4816 } else if (VM_Version::supports_avx()) {
4817 vpcmpeqd(dst, dst, dst, vector_len);
4818 } else {
4819 pcmpeqd(dst, dst);
4820 }
4821 }
4822
4823 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4824 int extra_slot_offset) {
4825 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4826 int stackElementSize = Interpreter::stackElementSize;
4827 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4828 #ifdef ASSERT
4829 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4830 assert(offset1 - offset == stackElementSize, "correct arithmetic");
4831 #endif
4832 Register scale_reg = noreg;
4833 Address::ScaleFactor scale_factor = Address::no_scale;
4834 if (arg_slot.is_constant()) {
4835 offset += arg_slot.as_constant() * stackElementSize;
4836 } else {
4837 scale_reg = arg_slot.as_register();
4838 scale_factor = Address::times(stackElementSize);
4839 }
4840 offset += wordSize; // return PC is on stack
4841 return Address(rsp, scale_reg, scale_factor, offset);
4842 }
4843
// Handle the receiver type profile update given the "recv" klass.
//
// Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
// If there are no matching or claimable receiver entries in RD, updates
// the polymorphic counter.
//
// This code is expected to be run by either the interpreter or JIT-ed code,
// without extra synchronization. For safety, receiver cells are claimed atomically,
// which avoids grossly misrepresenting the profiles under concurrent updates.
// For speed, counter updates are not atomic.
//
// Parameters:
//   recv       - register holding the receiver klass.
//   mdp        - register holding the base address the profile is addressed from.
//   mdp_offset - byte offset from mdp to the profile data; must be word-aligned.
//
// rscratch1 is used as the running slot offset. rax (and a spare register, if
// one has to be borrowed) is saved and restored around the CAS.
//
void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset) {
  int base_receiver_offset   = in_bytes(ReceiverTypeData::receiver_offset(0));
  int end_receiver_offset    = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit()));
  int poly_count_offset      = in_bytes(CounterData::count_offset());
  int receiver_step          = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
  int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;

  // Adjust for MDP offsets. Slots are pointer-sized, so is the global offset.
  assert(is_aligned(mdp_offset, BytesPerWord), "sanity");
  base_receiver_offset += mdp_offset;
  end_receiver_offset  += mdp_offset;
  poly_count_offset    += mdp_offset;

  // Scale down to optimize encoding. Slots are pointer-sized.
  // From here on all offsets are in words and are used with Address::times_ptr.
  assert(is_aligned(base_receiver_offset,   BytesPerWord), "sanity");
  assert(is_aligned(end_receiver_offset,    BytesPerWord), "sanity");
  assert(is_aligned(poly_count_offset,      BytesPerWord), "sanity");
  assert(is_aligned(receiver_step,          BytesPerWord), "sanity");
  assert(is_aligned(receiver_to_count_step, BytesPerWord), "sanity");
  base_receiver_offset   >>= LogBytesPerWord;
  end_receiver_offset    >>= LogBytesPerWord;
  poly_count_offset      >>= LogBytesPerWord;
  receiver_step          >>= LogBytesPerWord;
  receiver_to_count_step >>= LogBytesPerWord;

#ifdef ASSERT
  // We are about to walk the MDO slots without asking for offsets.
  // Check that our math hits all the right spots.
  for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
    int real_recv_offset  = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
    int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
    int offset = base_receiver_offset + receiver_step*c;
    int count_offset = offset + receiver_to_count_step;
    assert((offset << LogBytesPerWord) == real_recv_offset, "receiver slot math");
    assert((count_offset << LogBytesPerWord) == real_count_offset, "receiver count math");
  }
  int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
  assert(poly_count_offset << LogBytesPerWord == real_poly_count_offset, "poly counter math");
#endif

  // Corner case: no profile table. Increment poly counter and exit.
  if (ReceiverTypeData::row_limit() == 0) {
    addptr(Address(mdp, poly_count_offset, Address::times_ptr), DataLayout::counter_increment);
    return;
  }

  Register offset = rscratch1;

  Label L_loop_search_receiver, L_loop_search_empty;
  Label L_restart, L_found_recv, L_found_empty, L_count_update;

  // The code here recognizes three major cases:
  //   A. Fastest: receiver found in the table
  //   B. Fast: no receiver in the table, and the table is full
  //   C. Slow: no receiver in the table, free slots in the table
  //
  // The case A performance is most important, as perfectly-behaved code would end up
  // there, especially with larger TypeProfileWidth. The case B performance is
  // important as well, this is where bulk of code would land for normally megamorphic
  // cases. The case C performance is not essential, its job is to deal with installation
  // races, we optimize for code density instead. Case C needs to make sure that receiver
  // rows are only claimed once. This makes sure we never overwrite a row for another
  // receiver and never duplicate the receivers in the list, making profile type-accurate.
  //
  // It is very tempting to handle these cases in a single loop, and claim the first slot
  // without checking the rest of the table. But, profiling code should tolerate free slots
  // in the table, as class unloading can clear them. After such cleanup, the receiver
  // we need might be _after_ the free slot. Therefore, we need to let at least one full
  // scan complete before trying to install new slots. Splitting the code in several tight
  // loops also helpfully optimizes for cases A and B.
  //
  // This code is effectively:
  //
  // restart:
  //   // Case A: receiver is already installed
  //   for (i = 0; i < receiver_count(); i++) {
  //     if (receiver(i) == recv) goto found_recv(i);
  //   }
  //
  //   // Receiver is not installed: look for a free slot
  //   for (i = 0; i < receiver_count(); i++) {
  //     if (receiver(i) == null) goto found_null(i);
  //   }
  //
  //   // Case B: no free slot either, profile is full, polymorphic case
  //   count++;
  //   return
  //
  //   // Case C: try to install receiver
  // found_null(i):
  //   CAS(&receiver(i), null, recv);
  //   goto restart
  //
  // found_recv(i):
  //   *receiver_count(i)++
  //

  bind(L_restart);

  // Case A: receiver is already installed
  movptr(offset, base_receiver_offset);
  bind(L_loop_search_receiver);
    cmpptr(recv, Address(mdp, offset, Address::times_ptr));
    jccb(Assembler::equal, L_found_recv);
  addptr(offset, receiver_step);
  cmpptr(offset, end_receiver_offset);
  jccb(Assembler::notEqual, L_loop_search_receiver);

  // Receiver is not installed: look for a free (null) slot
  movptr(offset, base_receiver_offset);
  bind(L_loop_search_empty);
    cmpptr(Address(mdp, offset, Address::times_ptr), NULL_WORD);
    jccb(Assembler::equal, L_found_empty);
  addptr(offset, receiver_step);
  cmpptr(offset, end_receiver_offset);
  jccb(Assembler::notEqual, L_loop_search_empty);

  // Case B: receiver is not found and table is full.
  // Increment polymorphic counter instead of receiver slot.
  movptr(offset, poly_count_offset);
  jmpb(L_count_update);

  // Case C: try to install receiver
  bind(L_found_empty);

  // Atomically swing receiver slot: null -> recv.
  //
  // The update code uses CAS, which wants RAX register specifically, *and* it needs
  // other important registers untouched, as they form the address. Therefore, we need
  // to shift any important registers from RAX into some other spare register. If we
  // have a spare register, we are forced to save it on stack here.

  Register spare_reg = noreg;
  Register shifted_mdp = mdp;
  Register shifted_recv = recv;
  if (recv == rax || mdp == rax) {
    // Pick the first of rbx, rcx, rdx that is neither recv nor mdp.
    spare_reg = (recv != rbx && mdp != rbx) ? rbx :
                (recv != rcx && mdp != rcx) ? rcx :
                rdx;
    assert_different_registers(mdp, recv, offset, spare_reg);

    push(spare_reg);
    if (recv == rax) {
      movptr(spare_reg, recv);
      shifted_recv = spare_reg;
    } else {
      assert(mdp == rax, "Remaining case");
      movptr(spare_reg, mdp);
      shifted_mdp = spare_reg;
    }
  } else {
    // Neither input lives in rax; only rax itself needs preserving.
    push(rax);
  }

  // None of the important registers are in RAX after this shuffle.
  assert_different_registers(rax, shifted_mdp, shifted_recv, offset);

  // Expected old value for the CAS is null.
  xorptr(rax, rax);
  cmpxchgptr(shifted_recv, Address(shifted_mdp, offset, Address::times_ptr));

  // Unshift registers.
  if (recv == rax || mdp == rax) {
    // Put the displaced value (recv or mdp) back into rax, then restore the
    // borrowed register.
    movptr(rax, spare_reg);
    pop(spare_reg);
  } else {
    pop(rax);
  }

  // CAS success means the slot now has the receiver we want. CAS failure means
  // something had claimed the slot concurrently: it can be the same receiver we want,
  // or something else. Since this is a slow path, we can optimize for code density,
  // and just restart the search from the beginning.
  jmpb(L_restart);

  // Found a receiver, convert its slot offset to corresponding count offset.
  bind(L_found_recv);
  addptr(offset, receiver_to_count_step);

  // Finally, update the counter
  // (either the matched row's counter or the polymorphic counter).
  bind(L_count_update);
  addptr(Address(mdp, offset, Address::times_ptr), DataLayout::counter_increment);
}
5037
5038 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
5039 if (!VerifyOops) return;
5040
5041 push(rscratch1);
5042 push(rax); // save rax,
5043 // addr may contain rsp so we will have to adjust it based on the push
5044 // we just did (and on 64 bit we do two pushes)
5045 // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
5046 // stores rax into addr which is backwards of what was intended.
5047 if (addr.uses(rsp)) {
5048 lea(rax, addr);
5049 pushptr(Address(rax, 2 * BytesPerWord));
5050 } else {
5051 pushptr(addr);
5052 }
5053
5054 // Pass register number to verify_oop_subroutine
5055 const char* b = nullptr;
5056 {
5057 ResourceMark rm;
5058 stringStream ss;
5059 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
5060 b = code_string(ss.as_string());
5061 }
5062 AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
5063 pushptr(buffer.addr(), rscratch1);
5064
5065 // call indirectly to solve generation ordering problem
5066 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
5067 call(rax);
5068 // Caller pops the arguments (addr, message) and restores rax, r10.
5069 }
5070
5071 void MacroAssembler::verify_tlab() {
5072 #ifdef ASSERT
5073 if (UseTLAB && VerifyOops) {
5074 Label next, ok;
5075 Register t1 = rsi;
5076
5077 push(t1);
5078
5079 movptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
5080 cmpptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_start_offset())));
5081 jcc(Assembler::aboveEqual, next);
5082 STOP("assert(top >= start)");
5083 should_not_reach_here();
5084
5085 bind(next);
5086 movptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_end_offset())));
5087 cmpptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
5088 jcc(Assembler::aboveEqual, ok);
5089 STOP("assert(top <= end)");
5090 should_not_reach_here();
5091
5092 bind(ok);
5093 pop(t1);
5094 }
5095 #endif
5096 }
5097
5098 class ControlWord {
5099 public:
5100 int32_t _value;
5101
5102 int rounding_control() const { return (_value >> 10) & 3 ; }
5103 int precision_control() const { return (_value >> 8) & 3 ; }
5104 bool precision() const { return ((_value >> 5) & 1) != 0; }
5105 bool underflow() const { return ((_value >> 4) & 1) != 0; }
5106 bool overflow() const { return ((_value >> 3) & 1) != 0; }
5107 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
5108 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
5109 bool invalid() const { return ((_value >> 0) & 1) != 0; }
5110
5111 void print() const {
5112 // rounding control
5113 const char* rc;
5114 switch (rounding_control()) {
5115 case 0: rc = "round near"; break;
5116 case 1: rc = "round down"; break;
5117 case 2: rc = "round up "; break;
5118 case 3: rc = "chop "; break;
5119 default:
5120 rc = nullptr; // silence compiler warnings
5121 fatal("Unknown rounding control: %d", rounding_control());
5122 };
5123 // precision control
5124 const char* pc;
5125 switch (precision_control()) {
5126 case 0: pc = "24 bits "; break;
5127 case 1: pc = "reserved"; break;
5128 case 2: pc = "53 bits "; break;
5129 case 3: pc = "64 bits "; break;
5130 default:
5131 pc = nullptr; // silence compiler warnings
5132 fatal("Unknown precision control: %d", precision_control());
5133 };
5134 // flags
5135 char f[9];
5136 f[0] = ' ';
5137 f[1] = ' ';
5138 f[2] = (precision ()) ? 'P' : 'p';
5139 f[3] = (underflow ()) ? 'U' : 'u';
5140 f[4] = (overflow ()) ? 'O' : 'o';
5141 f[5] = (zero_divide ()) ? 'Z' : 'z';
5142 f[6] = (denormalized()) ? 'D' : 'd';
5143 f[7] = (invalid ()) ? 'I' : 'i';
5144 f[8] = '\x0';
5145 // output
5146 printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
5147 }
5148
5149 };
5150
// Bit-field view of the status word stored in FPU_State.
// Bit 15 is busy, bits 11-13 the top-of-stack index, bits 14/10/9/8 the
// condition codes C3/C2/C1/C0 and bits 0-7 the individual status flags.
class StatusWord {
 public:
  int32_t _value;

  bool busy() const         { return (_value & 0x8000) != 0; }
  // condition codes
  bool C3() const           { return (_value & 0x4000) != 0; }
  bool C2() const           { return (_value & 0x0400) != 0; }
  bool C1() const           { return (_value & 0x0200) != 0; }
  bool C0() const           { return (_value & 0x0100) != 0; }
  // three-bit top-of-stack field
  int  top() const          { return (_value & 0x3800) >> 11; }
  // status flags
  bool error_status() const { return (_value & 0x80) != 0; }
  bool stack_fault() const  { return (_value & 0x40) != 0; }
  bool precision() const    { return (_value & 0x20) != 0; }
  bool underflow() const    { return (_value & 0x10) != 0; }
  bool overflow() const     { return (_value & 0x08) != 0; }
  bool zero_divide() const  { return (_value & 0x04) != 0; }
  bool denormalized() const { return (_value & 0x02) != 0; }
  bool invalid() const      { return (_value & 0x01) != 0; }

  // Prints the low 16 bits in hex followed by the decoded flags, condition
  // codes and top-of-stack index. A clear bit shows as '-'. No newline is
  // written.
  void print() const {
    // condition codes
    const bool cc_set[4] = { C3(), C2(), C1(), C0() };
    const char cc_names[] = "3210";
    char c[5];
    for (int i = 0; i < 4; i++) {
      c[i] = cc_set[i] ? cc_names[i] : '-';
    }
    c[4] = '\x0';

    // flags
    const bool flag_set[8] = {
      error_status(), stack_fault(), precision(), underflow(),
      overflow(), zero_divide(), denormalized(), invalid()
    };
    const char flag_names[] = "ESPUOZDI";
    char f[9];
    for (int i = 0; i < 8; i++) {
      f[i] = flag_set[i] ? flag_names[i] : '-';
    }
    f[8] = '\x0';

    // output
    printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, f, c, top());
  }

};
5194
// Tag word stored in FPU_State: a packed sequence of two-bit tags.
class TagWord {
 public:
  int32_t _value;

  // Returns the two-bit tag number i (tag 0 occupies the lowest two bits).
  int tag_at(int i) const {
    const int shift = i * 2;
    return (_value >> shift) & 3;
  }

  // Prints the low 16 bits in hex. No newline is written.
  void print() const {
    const int low_half = _value & 0xFFFF;
    printf("%04x", low_half);
  }

};
5206
// One FPU data register as seen by FPU_State: two 32-bit halves (_m0 is
// printed last, _m1 first) followed by a 16-bit field holding sign and exponent.
class FPU_Register {
 public:
  int32_t _m0;
  int32_t _m1;
  int16_t _ex;

  // True only for the bit pattern _ex == -1, _m1 == 0xC0000000, _m0 == 0.
  bool is_indefinite() const {
    if (_ex != -1) {
      return false;
    }
    if (_m1 != (int32_t)0xC0000000) {
      return false;
    }
    return _m0 == 0;
  }

  // Prints sign, the three fields in hex and a "NaN" marker when the
  // exponent field is 0x7FFF or all ones. No newline is written.
  void print() const {
    const bool negative  = (_ex < 0);
    const bool nan_like  = (_ex == 0x7FFF) || (_ex == (int16_t)-1);
    const char sign      = negative ? '-' : '+';
    const char* kind     = nan_like ? "NaN" : " ";
    printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, kind);
  }

};
5224
5225 class FPU_State {
5226 public:
5227 enum {
5228 register_size = 10,
5229 number_of_registers = 8,
5230 register_mask = 7
5231 };
5232
5233 ControlWord _control_word;
5234 StatusWord _status_word;
5235 TagWord _tag_word;
5236 int32_t _error_offset;
5237 int32_t _error_selector;
5238 int32_t _data_offset;
5239 int32_t _data_selector;
5240 int8_t _register[register_size * number_of_registers];
5241
5242 int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
5243 FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; }
5244
5245 const char* tag_as_string(int tag) const {
5246 switch (tag) {
5247 case 0: return "valid";
5248 case 1: return "zero";
5249 case 2: return "special";
5250 case 3: return "empty";
5251 }
5252 ShouldNotReachHere();
5253 return nullptr;
5254 }
5255
5256 void print() const {
5257 // print computation registers
5258 { int t = _status_word.top();
5259 for (int i = 0; i < number_of_registers; i++) {
5260 int j = (i - t) & register_mask;
5261 printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
5262 st(j)->print();
5263 printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
5264 }
5265 }
5266 printf("\n");
5267 // print control registers
5268 printf("ctrl = "); _control_word.print(); printf("\n");
5269 printf("stat = "); _status_word .print(); printf("\n");
5270 printf("tags = "); _tag_word .print(); printf("\n");
5271 }
5272
5273 };
5274
// Bit-field view of the flags register image stored in IU_State.
class Flag_Register {
 public:
  int32_t _value;

  bool overflow() const        { return (_value & 0x800) != 0; }  // bit 11
  bool direction() const       { return (_value & 0x400) != 0; }  // bit 10
  bool sign() const            { return (_value & 0x080) != 0; }  // bit 7
  bool zero() const            { return (_value & 0x040) != 0; }  // bit 6
  bool auxiliary_carry() const { return (_value & 0x010) != 0; }  // bit 4
  bool parity() const          { return (_value & 0x004) != 0; }  // bit 2
  bool carry() const           { return (_value & 0x001) != 0; }  // bit 0

  // Prints the raw value in hex followed by one letter per flag; a clear
  // flag shows as '-'. No newline is written.
  void print() const {
    // flags
    const bool flag_set[7] = {
      overflow(), direction(), sign(), zero(), auxiliary_carry(), parity(), carry()
    };
    const char flag_names[] = "ODSZAPC";
    char f[8];
    for (int i = 0; i < 7; i++) {
      f[i] = flag_set[i] ? flag_names[i] : '-';
    }
    f[7] = '\x0';

    // output
    printf("%08x flags = %s", _value, f);
  }

};
5303
// One 32-bit integer register image stored in IU_State.
class IU_Register {
 public:
  int32_t _value;

  // Prints the value in hex and then in decimal. No newline is written.
  void print() const {
    const int32_t v = _value;
    printf("%08x %11d", v, v);
  }

};
5313
5314 class IU_State {
5315 public:
5316 Flag_Register _eflags;
5317 IU_Register _rdi;
5318 IU_Register _rsi;
5319 IU_Register _rbp;
5320 IU_Register _rsp;
5321 IU_Register _rbx;
5322 IU_Register _rdx;
5323 IU_Register _rcx;
5324 IU_Register _rax;
5325
5326 void print() const {
5327 // computation registers
5328 printf("rax, = "); _rax.print(); printf("\n");
5329 printf("rbx, = "); _rbx.print(); printf("\n");
5330 printf("rcx = "); _rcx.print(); printf("\n");
5331 printf("rdx = "); _rdx.print(); printf("\n");
5332 printf("rdi = "); _rdi.print(); printf("\n");
5333 printf("rsi = "); _rsi.print(); printf("\n");
5334 printf("rbp, = "); _rbp.print(); printf("\n");
5335 printf("rsp = "); _rsp.print(); printf("\n");
5336 printf("\n");
5337 // control registers
5338 printf("flgs = "); _eflags.print(); printf("\n");
5339 }
5340 };
5341
5342
5343 class CPU_State {
5344 public:
5345 FPU_State _fpu_state;
5346 IU_State _iu_state;
5347
5348 void print() const {
5349 printf("--------------------------------------------------\n");
5350 _iu_state .print();
5351 printf("\n");
5352 _fpu_state.print();
5353 printf("--------------------------------------------------\n");
5354 }
5355
5356 };
5357
5358
5359 static void _print_CPU_state(CPU_State* state) {
5360 state->print();
5361 };
5362
5363
5364 void MacroAssembler::print_CPU_state() {
5365 push_CPU_state();
5366 push(rsp); // pass CPU state
5367 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5368 addptr(rsp, wordSize); // discard argument
5369 pop_CPU_state();
5370 }
5371
5372 void MacroAssembler::restore_cpu_control_state_after_jni(Register rscratch) {
5373 // Either restore the MXCSR register after returning from the JNI Call
5374 // or verify that it wasn't changed (with -Xcheck:jni flag).
5375 if (RestoreMXCSROnJNICalls) {
5376 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), rscratch);
5377 } else if (CheckJNICalls) {
5378 call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5379 }
5380 // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5381 vzeroupper();
5382 }
5383
5384 // ((OopHandle)result).resolve();
5385 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5386 assert_different_registers(result, tmp);
5387
5388 // Only 64 bit platforms support GCs that require a tmp register
5389 // Only IN_HEAP loads require a thread_tmp register
5390 // OopHandle::resolve is an indirection like jobject.
5391 access_load_at(T_OBJECT, IN_NATIVE,
5392 result, Address(result, 0), tmp);
5393 }
5394
5395 // ((WeakHandle)result).resolve();
5396 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
5397 assert_different_registers(rresult, rtmp);
5398 Label resolved;
5399
5400 // A null weak handle resolves to null.
5401 cmpptr(rresult, 0);
5402 jcc(Assembler::equal, resolved);
5403
5404 // Only 64 bit platforms support GCs that require a tmp register
5405 // Only IN_HEAP loads require a thread_tmp register
5406 // WeakHandle::resolve is an indirection like jweak.
5407 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
5408 rresult, Address(rresult, 0), rtmp);
5409 bind(resolved);
5410 }
5411
5412 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
5413 // get mirror
5414 const int mirror_offset = in_bytes(Klass::java_mirror_offset());
5415 load_method_holder(mirror, method);
5416 movptr(mirror, Address(mirror, mirror_offset));
5417 resolve_oop_handle(mirror, tmp);
5418 }
5419
5420 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
5421 load_method_holder(rresult, rmethod);
5422 movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
5423 }
5424
5425 void MacroAssembler::load_method_holder(Register holder, Register method) {
5426 movptr(holder, Address(method, Method::const_offset())); // ConstMethod*
5427 movptr(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool*
5428 movptr(holder, Address(holder, ConstantPool::pool_holder_offset())); // InstanceKlass*
5429 }
5430
5431 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
5432 assert(UseCompactObjectHeaders, "expect compact object headers");
5433 movq(dst, Address(src, oopDesc::mark_offset_in_bytes()));
5434 shrq(dst, markWord::klass_shift);
5435 }
5436
5437 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
5438 assert_different_registers(src, tmp);
5439 assert_different_registers(dst, tmp);
5440
5441 if (UseCompactObjectHeaders) {
5442 load_narrow_klass_compact(dst, src);
5443 decode_klass_not_null(dst, tmp);
5444 } else {
5445 movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5446 decode_klass_not_null(dst, tmp);
5447 }
5448 }
5449
5450 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
5451 assert(!UseCompactObjectHeaders, "not with compact headers");
5452 assert_different_registers(src, tmp);
5453 assert_different_registers(dst, tmp);
5454 encode_klass_not_null(src, tmp);
5455 movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5456 }
5457
5458 void MacroAssembler::cmp_klass(Register klass, Register obj, Register tmp) {
5459 if (UseCompactObjectHeaders) {
5460 assert(tmp != noreg, "need tmp");
5461 assert_different_registers(klass, obj, tmp);
5462 load_narrow_klass_compact(tmp, obj);
5463 cmpl(klass, tmp);
5464 } else {
5465 cmpl(klass, Address(obj, oopDesc::klass_offset_in_bytes()));
5466 }
5467 }
5468
5469 void MacroAssembler::cmp_klasses_from_objects(Register obj1, Register obj2, Register tmp1, Register tmp2) {
5470 if (UseCompactObjectHeaders) {
5471 assert(tmp2 != noreg, "need tmp2");
5472 assert_different_registers(obj1, obj2, tmp1, tmp2);
5473 load_narrow_klass_compact(tmp1, obj1);
5474 load_narrow_klass_compact(tmp2, obj2);
5475 cmpl(tmp1, tmp2);
5476 } else {
5477 movl(tmp1, Address(obj1, oopDesc::klass_offset_in_bytes()));
5478 cmpl(tmp1, Address(obj2, oopDesc::klass_offset_in_bytes()));
5479 }
5480 }
5481
5482 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5483 Register tmp1) {
5484 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5485 decorators = AccessInternal::decorator_fixup(decorators, type);
5486 bool as_raw = (decorators & AS_RAW) != 0;
5487 if (as_raw) {
5488 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1);
5489 } else {
5490 bs->load_at(this, decorators, type, dst, src, tmp1);
5491 }
5492 }
5493
5494 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
5495 Register tmp1, Register tmp2, Register tmp3) {
5496 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5497 decorators = AccessInternal::decorator_fixup(decorators, type);
5498 bool as_raw = (decorators & AS_RAW) != 0;
5499 if (as_raw) {
5500 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
5501 } else {
5502 bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
5503 }
5504 }
5505
5506 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, DecoratorSet decorators) {
5507 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1);
5508 }
5509
5510 // Doesn't do verification, generates fixed size code
5511 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, DecoratorSet decorators) {
5512 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1);
5513 }
5514
5515 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
5516 Register tmp2, Register tmp3, DecoratorSet decorators) {
5517 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
5518 }
5519
5520 // Used for storing nulls.
5521 void MacroAssembler::store_heap_oop_null(Address dst) {
5522 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
5523 }
5524
5525 void MacroAssembler::store_klass_gap(Register dst, Register src) {
5526 assert(!UseCompactObjectHeaders, "Don't use with compact headers");
5527 // Store to klass gap in destination
5528 movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5529 }
5530
5531 #ifdef ASSERT
5532 void MacroAssembler::verify_heapbase(const char* msg) {
5533 assert (UseCompressedOops, "should be compressed");
5534 assert (Universe::heap() != nullptr, "java heap should be initialized");
5535 if (CheckCompressedOops) {
5536 Label ok;
5537 ExternalAddress src2(CompressedOops::base_addr());
5538 const bool is_src2_reachable = reachable(src2);
5539 if (!is_src2_reachable) {
5540 push(rscratch1); // cmpptr trashes rscratch1
5541 }
5542 cmpptr(r12_heapbase, src2, rscratch1);
5543 jcc(Assembler::equal, ok);
5544 STOP(msg);
5545 bind(ok);
5546 if (!is_src2_reachable) {
5547 pop(rscratch1);
5548 }
5549 }
5550 }
5551 #endif
5552
5553 // Algorithm must match oop.inline.hpp encode_heap_oop.
5554 void MacroAssembler::encode_heap_oop(Register r) {
5555 #ifdef ASSERT
5556 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5557 #endif
5558 verify_oop_msg(r, "broken oop in encode_heap_oop");
5559 if (CompressedOops::base() == nullptr) {
5560 if (CompressedOops::shift() != 0) {
5561 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5562 shrq(r, LogMinObjAlignmentInBytes);
5563 }
5564 return;
5565 }
5566 testq(r, r);
5567 cmovq(Assembler::equal, r, r12_heapbase);
5568 subq(r, r12_heapbase);
5569 shrq(r, LogMinObjAlignmentInBytes);
5570 }
5571
5572 void MacroAssembler::encode_heap_oop_not_null(Register r) {
5573 #ifdef ASSERT
5574 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5575 if (CheckCompressedOops) {
5576 Label ok;
5577 testq(r, r);
5578 jcc(Assembler::notEqual, ok);
5579 STOP("null oop passed to encode_heap_oop_not_null");
5580 bind(ok);
5581 }
5582 #endif
5583 verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
5584 if (CompressedOops::base() != nullptr) {
5585 subq(r, r12_heapbase);
5586 }
5587 if (CompressedOops::shift() != 0) {
5588 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5589 shrq(r, LogMinObjAlignmentInBytes);
5590 }
5591 }
5592
5593 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5594 #ifdef ASSERT
5595 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5596 if (CheckCompressedOops) {
5597 Label ok;
5598 testq(src, src);
5599 jcc(Assembler::notEqual, ok);
5600 STOP("null oop passed to encode_heap_oop_not_null2");
5601 bind(ok);
5602 }
5603 #endif
5604 verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
5605 if (dst != src) {
5606 movq(dst, src);
5607 }
5608 if (CompressedOops::base() != nullptr) {
5609 subq(dst, r12_heapbase);
5610 }
5611 if (CompressedOops::shift() != 0) {
5612 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5613 shrq(dst, LogMinObjAlignmentInBytes);
5614 }
5615 }
5616
5617 void MacroAssembler::decode_heap_oop(Register r) {
5618 #ifdef ASSERT
5619 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5620 #endif
5621 if (CompressedOops::base() == nullptr) {
5622 if (CompressedOops::shift() != 0) {
5623 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5624 shlq(r, LogMinObjAlignmentInBytes);
5625 }
5626 } else {
5627 Label done;
5628 shlq(r, LogMinObjAlignmentInBytes);
5629 jccb(Assembler::equal, done);
5630 addq(r, r12_heapbase);
5631 bind(done);
5632 }
5633 verify_oop_msg(r, "broken oop in decode_heap_oop");
5634 }
5635
5636 void MacroAssembler::decode_heap_oop_not_null(Register r) {
5637 // Note: it will change flags
5638 assert (UseCompressedOops, "should only be used for compressed headers");
5639 assert (Universe::heap() != nullptr, "java heap should be initialized");
5640 // Cannot assert, unverified entry point counts instructions (see .ad file)
5641 // vtableStubs also counts instructions in pd_code_size_limit.
5642 // Also do not verify_oop as this is called by verify_oop.
5643 if (CompressedOops::shift() != 0) {
5644 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5645 shlq(r, LogMinObjAlignmentInBytes);
5646 if (CompressedOops::base() != nullptr) {
5647 addq(r, r12_heapbase);
5648 }
5649 } else {
5650 assert (CompressedOops::base() == nullptr, "sanity");
5651 }
5652 }
5653
5654 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5655 // Note: it will change flags
5656 assert (UseCompressedOops, "should only be used for compressed headers");
5657 assert (Universe::heap() != nullptr, "java heap should be initialized");
5658 // Cannot assert, unverified entry point counts instructions (see .ad file)
5659 // vtableStubs also counts instructions in pd_code_size_limit.
5660 // Also do not verify_oop as this is called by verify_oop.
5661 if (CompressedOops::shift() != 0) {
5662 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5663 if (LogMinObjAlignmentInBytes == Address::times_8) {
5664 leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5665 } else {
5666 if (dst != src) {
5667 movq(dst, src);
5668 }
5669 shlq(dst, LogMinObjAlignmentInBytes);
5670 if (CompressedOops::base() != nullptr) {
5671 addq(dst, r12_heapbase);
5672 }
5673 }
5674 } else {
5675 assert (CompressedOops::base() == nullptr, "sanity");
5676 if (dst != src) {
5677 movq(dst, src);
5678 }
5679 }
5680 }
5681
5682 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
5683 BLOCK_COMMENT("encode_klass_not_null {");
5684 assert_different_registers(r, tmp);
5685 if (CompressedKlassPointers::base() != nullptr) {
5686 if (AOTCodeCache::is_on_for_dump()) {
5687 movptr(tmp, ExternalAddress(CompressedKlassPointers::base_addr()));
5688 } else {
5689 movptr(tmp, (intptr_t)CompressedKlassPointers::base());
5690 }
5691 subq(r, tmp);
5692 }
5693 if (CompressedKlassPointers::shift() != 0) {
5694 shrq(r, CompressedKlassPointers::shift());
5695 }
5696 BLOCK_COMMENT("} encode_klass_not_null");
5697 }
5698
5699 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
5700 BLOCK_COMMENT("encode_and_move_klass_not_null {");
5701 assert_different_registers(src, dst);
5702 if (CompressedKlassPointers::base() != nullptr) {
5703 if (AOTCodeCache::is_on_for_dump()) {
5704 movptr(dst, ExternalAddress(CompressedKlassPointers::base_addr()));
5705 negq(dst);
5706 } else {
5707 movptr(dst, -(intptr_t)CompressedKlassPointers::base());
5708 }
5709 addq(dst, src);
5710 } else {
5711 movptr(dst, src);
5712 }
5713 if (CompressedKlassPointers::shift() != 0) {
5714 shrq(dst, CompressedKlassPointers::shift());
5715 }
5716 BLOCK_COMMENT("} encode_and_move_klass_not_null");
5717 }
5718
5719 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
5720 BLOCK_COMMENT("decode_klass_not_null {");
5721 assert_different_registers(r, tmp);
5722 // Note: it will change flags
5723 // Cannot assert, unverified entry point counts instructions (see .ad file)
5724 // vtableStubs also counts instructions in pd_code_size_limit.
5725 // Also do not verify_oop as this is called by verify_oop.
5726 if (CompressedKlassPointers::shift() != 0) {
5727 shlq(r, CompressedKlassPointers::shift());
5728 }
5729 if (CompressedKlassPointers::base() != nullptr) {
5730 if (AOTCodeCache::is_on_for_dump()) {
5731 movptr(tmp, ExternalAddress(CompressedKlassPointers::base_addr()));
5732 } else {
5733 movptr(tmp, (intptr_t)CompressedKlassPointers::base());
5734 }
5735 addq(r, tmp);
5736 }
5737 BLOCK_COMMENT("} decode_klass_not_null");
5738 }
5739
// Decodes the narrow klass in 'src' to a Klass* in 'dst' without a temp
// register. 'src' and 'dst' must be different registers; 'src' is not
// modified. Three code shapes are generated depending on the encoding:
//   1. no base, no shift : 32-bit register move;
//   2. shift <= times_8  : dst = base (or 0), then add src (scaled by 8 via
//                          lea when the shift is non-zero);
//   3. larger shift      : dst = (base >> shift) + src, then dst <<= shift.
void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
  BLOCK_COMMENT("decode_and_move_klass_not_null {");
  assert_different_registers(src, dst);
  // Note: it will change flags
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.

  if (CompressedKlassPointers::base() == nullptr &&
      CompressedKlassPointers::shift() == 0) {
    // The best case scenario is that there is no base or shift. Then it is already
    // a pointer that needs nothing but a register rename.
    movl(dst, src);
  } else {
    if (CompressedKlassPointers::shift() <= Address::times_8) {
      // Small shift: seed dst with the base (or zero) ...
      if (CompressedKlassPointers::base() != nullptr) {
        if (AOTCodeCache::is_on_for_dump()) {
          // When dumping AOT code the base is loaded from memory rather than
          // embedded as an immediate.
          movptr(dst, ExternalAddress(CompressedKlassPointers::base_addr()));
        } else {
          movptr(dst, (intptr_t)CompressedKlassPointers::base());
        }
      } else {
        xorq(dst, dst);
      }
      // ... then fold in src. The only non-zero shift accepted here is
      // times_8 (asserted), which lets a single lea do shift + add.
      if (CompressedKlassPointers::shift() != 0) {
        assert(CompressedKlassPointers::shift() == Address::times_8, "klass not aligned on 64bits?");
        leaq(dst, Address(dst, src, Address::times_8, 0));
      } else {
        addq(dst, src);
      }
    } else {
      // Shift too large for an addressing-mode scale: pre-shift the base
      // right, add src, and shift the sum left.
      // NOTE(review): this presumably relies on the low 'shift' bits of the
      // base being zero so that no bits are lost -- confirm.
      if (CompressedKlassPointers::base() != nullptr) {
        if (AOTCodeCache::is_on_for_dump()) {
          // Base is loaded from memory, so the right shift is done at runtime.
          movptr(dst, ExternalAddress(CompressedKlassPointers::base_addr()));
          shrq(dst, CompressedKlassPointers::shift());
        } else {
          const intptr_t base_right_shifted =
              (intptr_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
          movptr(dst, base_right_shifted);
        }
      } else {
        xorq(dst, dst);
      }
      addq(dst, src);
      shlq(dst, CompressedKlassPointers::shift());
    }
  }
  BLOCK_COMMENT("} decode_and_move_klass_not_null");
}
5789
5790 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5791 assert (UseCompressedOops, "should only be used for compressed headers");
5792 assert (Universe::heap() != nullptr, "java heap should be initialized");
5793 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5794 int oop_index = oop_recorder()->find_index(obj);
5795 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5796 mov_narrow_oop(dst, oop_index, rspec);
5797 }
5798
5799 void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5800 assert (UseCompressedOops, "should only be used for compressed headers");
5801 assert (Universe::heap() != nullptr, "java heap should be initialized");
5802 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5803 int oop_index = oop_recorder()->find_index(obj);
5804 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5805 mov_narrow_oop(dst, oop_index, rspec);
5806 }
5807
5808 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5809 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5810 int klass_index = oop_recorder()->find_index(k);
5811 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5812 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5813 }
5814
5815 void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5816 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5817 int klass_index = oop_recorder()->find_index(k);
5818 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5819 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5820 }
5821
5822 void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5823 assert (UseCompressedOops, "should only be used for compressed headers");
5824 assert (Universe::heap() != nullptr, "java heap should be initialized");
5825 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5826 int oop_index = oop_recorder()->find_index(obj);
5827 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5828 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5829 }
5830
5831 void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5832 assert (UseCompressedOops, "should only be used for compressed headers");
5833 assert (Universe::heap() != nullptr, "java heap should be initialized");
5834 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5835 int oop_index = oop_recorder()->find_index(obj);
5836 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5837 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5838 }
5839
5840 void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5841 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5842 int klass_index = oop_recorder()->find_index(k);
5843 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5844 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5845 }
5846
5847 void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5848 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5849 int klass_index = oop_recorder()->find_index(k);
5850 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5851 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5852 }
5853
5854 void MacroAssembler::reinit_heapbase() {
5855 if (UseCompressedOops) {
5856 if (Universe::heap() != nullptr && !AOTCodeCache::is_on_for_dump()) {
5857 if (CompressedOops::base() == nullptr) {
5858 MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5859 } else {
5860 mov64(r12_heapbase, (int64_t)CompressedOops::base());
5861 }
5862 } else {
5863 movptr(r12_heapbase, ExternalAddress(CompressedOops::base_addr()));
5864 }
5865 }
5866 }
5867
5868 #ifdef COMPILER2
5869
// clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
//
// Both 'base' and 'cnt' are modified by the generated code. 'xtmp' is zeroed
// and used as the store source; 'rtmp' and 'mask' are only handed to the
// masked-fill helpers.
// Generated code shape:
//   - main loop: zero 64 bytes per iteration while at least 8 qwords remain;
//   - 64-byte-vector mode: one masked store for the remaining 1..7 qwords;
//   - otherwise: an optional 32-byte store, then either a masked 32-byte
//     store or a qword-at-a-time loop for what is left.
void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
  // cnt - number of qwords (8-byte words).
  // base - start address, qword aligned.
  Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
  bool use64byteVector = (MaxVectorSize == 64) && (CopyAVX3Threshold == 0);
  // Zero the widest vector register that will be used for the stores.
  if (use64byteVector) {
    vpxor(xtmp, xtmp, xtmp, AVX_512bit);
  } else if (MaxVectorSize >= 32) {
    vpxor(xtmp, xtmp, xtmp, AVX_256bit);
  } else {
    pxor(xtmp, xtmp);
  }
  // Enter the loop at its condition so that counts below 8 skip the body.
  jmp(L_zero_64_bytes);

  BIND(L_loop);
  // Zero one 64-byte chunk.
  if (MaxVectorSize >= 32) {
    fill64(base, 0, xtmp, use64byteVector);
  } else {
    movdqu(Address(base, 0), xtmp);
    movdqu(Address(base, 16), xtmp);
    movdqu(Address(base, 32), xtmp);
    movdqu(Address(base, 48), xtmp);
  }
  addptr(base, 64);

  BIND(L_zero_64_bytes);
  subptr(cnt, 8);
  jccb(Assembler::greaterEqual, L_loop);

  // Zero the trailing part (fewer than 64 bytes).
  // Here cnt == remaining qwords - 8, i.e. it is negative.
  if (use64byteVector) {
    addptr(cnt, 8);                 // cnt := remaining qwords (0..7)
    jccb(Assembler::equal, L_end);
    fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
    jmp(L_end);
  } else {
    addptr(cnt, 4);                 // cnt := remaining qwords - 4
    jccb(Assembler::less, L_tail);  // fewer than 4 qwords left
    // At least 4 qwords left: zero 32 bytes.
    if (MaxVectorSize >= 32) {
      vmovdqu(Address(base, 0), xtmp);
    } else {
      movdqu(Address(base, 0), xtmp);
      movdqu(Address(base, 16), xtmp);
    }
  }
  // Only reached on the non-64-byte-vector path (the other one jumps to L_end).
  addptr(base, 32);
  subptr(cnt, 4);

  BIND(L_tail);
  addptr(cnt, 4);                   // cnt := remaining qwords (at most 3)
  jccb(Assembler::lessEqual, L_end);
  if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
    fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
  } else {
    // Scalar tail: one 8-byte store per remaining qword.
    decrement(cnt);

    BIND(L_sloop);
    movq(Address(base, 0), xtmp);
    addptr(base, 8);
    decrement(cnt);
    jccb(Assembler::greaterEqual, L_sloop);
  }
  BIND(L_end);
}
5935
// Clearing constant sized memory using YMM/ZMM registers.
//
// 'cnt' is a compile-time constant; from the arithmetic below it is counted in
// 8-byte words (cnt >> 3 is the number of 64-byte chunks). Requires AVX-512
// with VL support. 'base' is not modified by the generated code; 'rtmp' is
// used as loop index and for building mask constants, 'mask' holds the
// store mask for partial vectors, 'xtmp' is the zeroed store source.
void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
  assert(UseAVX > 2 && VM_Version::supports_avx512vl(), "");
  bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);

  // Split cnt into whole 64-byte chunks and a remainder of 0..7 words.
  int vector64_count = (cnt & (~0x7)) >> 3;
  cnt = cnt & 0x7;
  const int fill64_per_loop = 4;
  const int max_unrolled_fill64 = 8;

  // 64 byte initialization loop.
  vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
  int start64 = 0;
  if (vector64_count > max_unrolled_fill64) {
    // Too many chunks to unroll fully: emit a runtime loop that clears
    // fill64_per_loop chunks per iteration, covering the largest multiple of
    // fill64_per_loop chunks. 'index' is a byte offset from base.
    Label LOOP;
    Register index = rtmp;

    start64 = vector64_count - (vector64_count % fill64_per_loop);

    movl(index, 0);
    BIND(LOOP);
    for (int i = 0; i < fill64_per_loop; i++) {
      fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
    }
    addl(index, fill64_per_loop * 64);
    cmpl(index, start64 * 64);
    jccb(Assembler::less, LOOP);
  }
  // Chunks not covered by the loop (all of them if no loop was emitted) are
  // cleared with straight-line code at constant displacements.
  for (int i = start64; i < vector64_count; i++) {
    fill64(base, i * 64, xtmp, use64byteVector);
  }

  // Clear remaining 64 byte tail.
  // One case per remaining word count; partial vectors use a k-mask with one
  // bit per 8-byte lane (e.g. 0x7 = three lanes).
  int disp = vector64_count * 64;
  if (cnt) {
    switch (cnt) {
      case 1:
        movq(Address(base, disp), xtmp);
        break;
      case 2:
        evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit);
        break;
      case 3:
        movl(rtmp, 0x7);
        kmovwl(mask, rtmp);
        evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit);
        break;
      case 4:
        evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
        break;
      case 5:
        if (use64byteVector) {
          movl(rtmp, 0x1F);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
        } else {
          // 32 bytes + 8 bytes
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
          movq(Address(base, disp + 32), xtmp);
        }
        break;
      case 6:
        if (use64byteVector) {
          movl(rtmp, 0x3F);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
        } else {
          // 32 bytes + 16 bytes
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
          evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit);
        }
        break;
      case 7:
        if (use64byteVector) {
          movl(rtmp, 0x7F);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
        } else {
          // 32 bytes + masked 24 bytes
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
          movl(rtmp, 0x7);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit);
        }
        break;
      default:
        // cnt was masked with 0x7 above, so this is not expected to be hit.
        fatal("Unexpected length : %d\n",cnt);
        break;
    }
  }
}
6024
// Clears 'cnt' qwords starting at 'base' with a runtime count.
// The registers are fixed (base == rdi, tmp == rax, cnt == rcx) because the
// rep-stos paths require them. Unless 'is_large' is set, counts up to
// InitArrayShortSize bytes are cleared with a simple store loop; larger
// counts use rep stosb, XMM stores or rep stos, depending on the
// UseFastStosb / UseXMMForObjInit flags. 'xtmp' and 'mask' are only used on
// the XMM path.
void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
                               bool is_large, KRegister mask) {
  // cnt - number of qwords (8-byte words).
  // base - start address, qword aligned.
  // is_large - if optimizers know cnt is larger than InitArrayShortSize
  assert(base==rdi, "base register must be edi for rep stos");
  assert(tmp==rax, "tmp register must be eax for rep stos");
  assert(cnt==rcx, "cnt register must be ecx for rep stos");
  assert(InitArrayShortSize % BytesPerLong == 0,
         "InitArrayShortSize should be the multiple of BytesPerLong");

  Label DONE;
  // tmp (rax) is the zero source for the short loop and for rep stos; it is
  // only left unset when the large XMM path is the one that will run.
  if (!is_large || !UseXMMForObjInit) {
    xorptr(tmp, tmp);
  }

  if (!is_large) {
    Label LOOP, LONG;
    // Threshold is expressed in qwords.
    cmpptr(cnt, InitArrayShortSize/BytesPerLong);
    jccb(Assembler::greater, LONG);

    decrement(cnt);
    jccb(Assembler::negative, DONE); // Zero length

    // Use individual pointer-sized stores for small counts:
    // (stores run from the last word down to index 0)
    BIND(LOOP);
    movptr(Address(base, cnt, Address::times_ptr), tmp);
    decrement(cnt);
    jccb(Assembler::greaterEqual, LOOP);
    jmpb(DONE);

    BIND(LONG);
  }

  // Use longer rep-prefixed ops for non-small counts:
  if (UseFastStosb) {
    shlptr(cnt, 3); // convert to number of bytes
    rep_stosb();
  } else if (UseXMMForObjInit) {
    xmm_clear_mem(base, cnt, tmp, xtmp, mask);
  } else {
    rep_stos();
  }

  BIND(DONE);
}
6071
6072 #endif //COMPILER2
6073
6074
// Emits code that fills 'count' elements of type 't' (T_BYTE, T_SHORT or
// T_INT) starting at address 'to' with 'value'.
//
//   t       - element type; any other type hits ShouldNotReachHere()
//   aligned - when false (and unaligned stores are disabled) byte/short
//             fills first store single elements to reach 4-byte alignment
//   to      - destination address, advanced by the generated code
//   value   - fill value; it is replicated in place to a 32-bit pattern
//   count   - element count, consumed by the generated code
//   rtmp    - scratch general purpose register
//   xtmp    - scratch vector register holding the broadcast pattern
//
// 'shift' is log2 of the number of elements per 4 bytes, so 'N << shift' is
// the element count corresponding to N dwords (e.g. 8 << shift == 32 bytes).
// When AVX-512 VL/BW and BMI2 are available (C2 builds, MaxVectorSize >= 32)
// the work is delegated to generate_fill_avx3().
void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                   Register to, Register value, Register count,
                                   Register rtmp, XMMRegister xtmp) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(to, value, count, rtmp);
  Label L_exit;
  Label L_fill_2_bytes, L_fill_4_bytes;

#if defined(COMPILER2)
  if(MaxVectorSize >=32 &&
     VM_Version::supports_avx512vlbw() &&
     VM_Version::supports_bmi2()) {
    generate_fill_avx3(t, to, value, count, rtmp, xtmp);
    return;
  }
#endif

  int shift = -1;
  switch (t) {
    case T_BYTE:
      shift = 2;
      break;
    case T_SHORT:
      shift = 1;
      break;
    case T_INT:
      shift = 0;
      break;
    default: ShouldNotReachHere();
  }

  // Replicate the element across all 32 bits of 'value':
  // byte -> 16 bits first, then 16 bits -> 32 bits for both byte and short.
  if (t == T_BYTE) {
    andl(value, 0xff);
    movl(rtmp, value);
    shll(rtmp, 8);
    orl(value, rtmp);
  }
  if (t == T_SHORT) {
    andl(value, 0xffff);
  }
  if (t == T_BYTE || t == T_SHORT) {
    movl(rtmp, value);
    shll(rtmp, 16);
    orl(value, rtmp);
  }

  cmpptr(count, 8 << shift); // Short arrays (< 32 bytes) fill by element
  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
    Label L_skip_align2;
    // align source address at 4 bytes address boundary
    if (t == T_BYTE) {
      Label L_skip_align1;
      // One byte misalignment happens only for byte arrays
      testptr(to, 1);
      jccb(Assembler::zero, L_skip_align1);
      movb(Address(to, 0), value);
      increment(to);
      decrement(count);
      BIND(L_skip_align1);
    }
    // Two bytes misalignment happens only for byte and short (char) arrays
    testptr(to, 2);
    jccb(Assembler::zero, L_skip_align2);
    movw(Address(to, 0), value);
    addptr(to, 2);
    subptr(count, 1<<(shift-1));
    BIND(L_skip_align2);
  }
  {
    Label L_fill_32_bytes;
    if (!UseUnalignedLoadStores) {
      // align to 8 bytes, we know we are 4 byte aligned to start
      testptr(to, 4);
      jccb(Assembler::zero, L_fill_32_bytes);
      movl(Address(to, 0), value);
      addptr(to, 4);
      subptr(count, 1<<shift);
    }
    BIND(L_fill_32_bytes);
    {
      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
      // Move the 32-bit pattern into the vector register; it is broadcast to
      // the full vector width below.
      movdl(xtmp, value);
      if (UseAVX >= 2 && UseUnalignedLoadStores) {
        Label L_check_fill_32_bytes;
        if (UseAVX > 2) {
          // Fill 64-byte chunks
          Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;

          // If number of bytes to fill < CopyAVX3Threshold, perform fill using AVX2
          cmpptr(count, CopyAVX3Threshold);
          jccb(Assembler::below, L_check_fill_64_bytes_avx2);

          vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);

          subptr(count, 16 << shift);
          jcc(Assembler::less, L_check_fill_32_bytes);
          align(16);

          // One 64-byte (512-bit) store per iteration.
          BIND(L_fill_64_bytes_loop_avx3);
          evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
          addptr(to, 64);
          subptr(count, 16 << shift);
          jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
          jmpb(L_check_fill_32_bytes);

          BIND(L_check_fill_64_bytes_avx2);
        }
        // Fill 64-byte chunks
        vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);

        subptr(count, 16 << shift);
        jcc(Assembler::less, L_check_fill_32_bytes);

        // align data for 64-byte chunks
        Label L_fill_64_bytes_loop, L_align_64_bytes_loop;
        if (EnableX86ECoreOpts) {
          // align 'big' arrays to cache lines to minimize split_stores
          cmpptr(count, 96 << shift);
          jcc(Assembler::below, L_fill_64_bytes_loop);

          // Find the bytes needed for alignment
          movptr(rtmp, to);
          andptr(rtmp, 0x1c);
          jcc(Assembler::zero, L_fill_64_bytes_loop);
          negptr(rtmp); // number of bytes to fill 32-rtmp. it filled by 2 mov by 32
          addptr(rtmp, 32);
          shrptr(rtmp, 2 - shift);// get number of elements from bytes
          subptr(count, rtmp); // adjust count by number of elements

          // Store 4 bytes at a time until the alignment elements are used up.
          align(16);
          BIND(L_align_64_bytes_loop);
          movdl(Address(to, 0), xtmp);
          addptr(to, 4);
          subptr(rtmp, 1 << shift);
          jcc(Assembler::greater, L_align_64_bytes_loop);
        }

        // Two 32-byte (256-bit) stores per iteration.
        align(16);
        BIND(L_fill_64_bytes_loop);
        vmovdqu(Address(to, 0), xtmp);
        vmovdqu(Address(to, 32), xtmp);
        addptr(to, 64);
        subptr(count, 16 << shift);
        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);

        // count is (remaining - 64 bytes' worth) here; add back 32 bytes'
        // worth to see whether one more 32-byte store fits.
        align(16);
        BIND(L_check_fill_32_bytes);
        addptr(count, 8 << shift);
        jccb(Assembler::less, L_check_fill_8_bytes);
        vmovdqu(Address(to, 0), xtmp);
        addptr(to, 32);
        subptr(count, 8 << shift);

        BIND(L_check_fill_8_bytes);
        // clean upper bits of YMM registers
        movdl(xtmp, value);
        pshufd(xtmp, xtmp, 0);
      } else {
        // Fill 32-byte chunks
        pshufd(xtmp, xtmp, 0);

        subptr(count, 8 << shift);
        jcc(Assembler::less, L_check_fill_8_bytes);
        align(16);

        BIND(L_fill_32_bytes_loop);

        if (UseUnalignedLoadStores) {
          movdqu(Address(to, 0), xtmp);
          movdqu(Address(to, 16), xtmp);
        } else {
          movq(Address(to, 0), xtmp);
          movq(Address(to, 8), xtmp);
          movq(Address(to, 16), xtmp);
          movq(Address(to, 24), xtmp);
        }

        addptr(to, 32);
        subptr(count, 8 << shift);
        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);

        BIND(L_check_fill_8_bytes);
      }
      // Undo the last 32-byte subtraction; count is now the number of
      // elements still to fill (less than 32 bytes' worth).
      addptr(count, 8 << shift);
      jccb(Assembler::zero, L_exit);
      jmpb(L_fill_8_bytes);

      //
      // length is too short, just fill qwords
      //
      align(16);
      BIND(L_fill_8_bytes_loop);
      movq(Address(to, 0), xtmp);
      addptr(to, 8);
      BIND(L_fill_8_bytes);
      subptr(count, 1 << (shift + 1));
      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    }
  }

  // Tail after the qword loop: count has gone negative, but its low bits
  // still tell whether a trailing 4/2/1 byte store is needed.
  Label L_fill_4_bytes_loop;
  testl(count, 1 << shift);
  jccb(Assembler::zero, L_fill_2_bytes);

  // Short arrays enter at L_fill_4_bytes and loop here, storing 4 bytes per
  // iteration; the fall-through case above executes the body exactly once
  // because count is already negative.
  align(16);
  BIND(L_fill_4_bytes_loop);
  movl(Address(to, 0), value);
  addptr(to, 4);

  BIND(L_fill_4_bytes);
  subptr(count, 1 << shift);
  jccb(Assembler::greaterEqual, L_fill_4_bytes_loop);

  if (t == T_BYTE || t == T_SHORT) {
    Label L_fill_byte;
    BIND(L_fill_2_bytes);
    // fill trailing 2 bytes
    testl(count, 1<<(shift-1));
    jccb(Assembler::zero, L_fill_byte);
    movw(Address(to, 0), value);
    if (t == T_BYTE) {
      addptr(to, 2);
      BIND(L_fill_byte);
      // fill trailing byte
      testl(count, 1);
      jccb(Assembler::zero, L_exit);
      movb(Address(to, 0), value);
    } else {
      BIND(L_fill_byte);
    }
  } else {
    // T_INT has no sub-dword tail.
    BIND(L_fill_2_bytes);
  }
  BIND(L_exit);
}
6311
6312 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
6313 switch(type) {
6314 case T_BYTE:
6315 case T_BOOLEAN:
6316 evpbroadcastb(dst, src, vector_len);
6317 break;
6318 case T_SHORT:
6319 case T_CHAR:
6320 evpbroadcastw(dst, src, vector_len);
6321 break;
6322 case T_INT:
6323 case T_FLOAT:
6324 evpbroadcastd(dst, src, vector_len);
6325 break;
6326 case T_LONG:
6327 case T_DOUBLE:
6328 evpbroadcastq(dst, src, vector_len);
6329 break;
6330 default:
6331 fatal("Unhandled type : %s", type2name(type));
6332 break;
6333 }
6334 }
6335
6336 // Encode given char[]/byte[] to byte[] in ISO_8859_1 or ASCII
6337 //
6338 // @IntrinsicCandidate
6339 // int sun.nio.cs.ISO_8859_1.Encoder#encodeISOArray0(
6340 // char[] sa, int sp, byte[] da, int dp, int len) {
6341 // int i = 0;
6342 // for (; i < len; i++) {
6343 // char c = sa[sp++];
6344 // if (c > '\u00FF')
6345 // break;
6346 // da[dp++] = (byte) c;
6347 // }
6348 // return i;
6349 // }
6350 //
6351 // @IntrinsicCandidate
6352 // int java.lang.StringCoding.encodeISOArray0(
6353 // byte[] sa, int sp, byte[] da, int dp, int len) {
6354 // int i = 0;
6355 // for (; i < len; i++) {
6356 // char c = StringUTF16.getChar(sa, sp++);
6357 // if (c > '\u00FF')
6358 // break;
6359 // da[dp++] = (byte) c;
6360 // }
6361 // return i;
6362 // }
6363 //
6364 // @IntrinsicCandidate
6365 // int java.lang.StringCoding.encodeAsciiArray0(
6366 // char[] sa, int sp, byte[] da, int dp, int len) {
6367 // int i = 0;
6368 // for (; i < len; i++) {
6369 // char c = sa[sp++];
6370 // if (c >= '\u0080')
6371 // break;
6372 // da[dp++] = (byte) c;
6373 // }
6374 // return i;
6375 // }
void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
                                      XMMRegister tmp1Reg, XMMRegister tmp2Reg,
                                      XMMRegister tmp3Reg, XMMRegister tmp4Reg,
                                      Register tmp5, Register result, bool ascii) {

  // Emits the copy loop sketched by the Java pseudo-code preceding this
  // function: 16-bit chars are narrowed to bytes until one fails the range
  // test selected by 'ascii'; 'result' ends up holding the number of chars
  // that were copied. src, dst and len are modified.
  //
  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(src, dst, len, tmp5, result);
  Label L_done, L_copy_1_char, L_copy_1_char_exit;

  // Bits that must be clear in a 16-bit char for it to be encodable:
  // bits 7..15 when 'ascii' is set, bits 8..15 otherwise. 'mask' covers two
  // adjacent chars (one 32-bit lane), 'short_mask' a single char.
  int mask = ascii ? 0xff80ff80 : 0xff00ff00;
  int short_mask = ascii ? 0xff80 : 0xff00;

  // set result
  xorl(result, result);
  // check for zero length
  testl(len, len);
  jcc(Assembler::zero, L_done);

  // Start from "everything copied"; L_copy_1_char_exit subtracts what was not.
  movl(result, len);

  // Setup pointers
  // src/dst are moved just past their last element and len is negated, so
  // that base + len * scale walks forward while len counts up towards zero.
  lea(src, Address(src, len, Address::times_2)); // char[]
  lea(dst, Address(dst, len, Address::times_1)); // byte[]
  negptr(len);

  if (UseSSE42Intrinsics || UseAVX >= 2) {
    Label L_copy_8_chars, L_copy_8_chars_exit;
    Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;

    if (UseAVX >= 2) {
      Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
      movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
      movdl(tmp1Reg, tmp5);
      vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
      jmp(L_chars_32_check);

      // 32 chars per iteration: len has already been advanced by 32 at the
      // check below, hence the -64/-32 byte displacements on the loads.
      bind(L_copy_32_chars);
      vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
      vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
      vptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector
      jccb(Assembler::notZero, L_copy_32_chars_exit);
      // vpackuswb packs within each 128-bit lane, so the quadwords come out
      // interleaved; vpermq with 0xD8 puts them back into source order.
      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
      vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);

      bind(L_chars_32_check);
      addptr(len, 32);
      jcc(Assembler::lessEqual, L_copy_32_chars); // at least 32 chars were left

      // Take back half of the 32-char look-ahead; a positive len now means
      // fewer than 16 chars are left, so the 16-char step is skipped.
      bind(L_copy_32_chars_exit);
      subptr(len, 16);
      jccb(Assembler::greater, L_copy_16_chars_exit);

    } else if (UseSSE42Intrinsics) {
      movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
      jmpb(L_chars_16_check);
    }

    // 16 chars per iteration (a single pass on the AVX2 path, which falls in
    // here from L_copy_32_chars_exit).
    bind(L_copy_16_chars);
    if (UseAVX >= 2) {
      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
      vptest(tmp2Reg, tmp1Reg);
      jcc(Assembler::notZero, L_copy_16_chars_exit);
      // The second pack source is just the mask register; after vpermq the
      // 16 data bytes sit in the low 128 bits, which is all the movdqu below stores.
      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
    } else {
      if (UseAVX > 0) {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
      } else {
        // NOTE(review): on this path tmp2Reg is only OR-ed into and nothing
        // in this function writes it beforehand. Stale bits look like they can
        // only cause an early exit to the narrower loops below (which re-test
        // the data), not a wrong result - confirm, and consider initialising
        // tmp2Reg from the first load.
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        por(tmp2Reg, tmp3Reg);
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        por(tmp2Reg, tmp4Reg);
      }
      ptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector
      jccb(Assembler::notZero, L_copy_16_chars_exit);
      packuswb(tmp3Reg, tmp4Reg);
    }
    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);

    bind(L_chars_16_check);
    addptr(len, 16);
    jcc(Assembler::lessEqual, L_copy_16_chars);

    bind(L_copy_16_chars_exit);
    if (UseAVX >= 2) {
      // clean upper bits of YMM registers
      vpxor(tmp2Reg, tmp2Reg);
      vpxor(tmp3Reg, tmp3Reg);
      vpxor(tmp4Reg, tmp4Reg);
      // rebuild the mask (tmp5 still holds it) as a 128-bit value
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
    }
    // Reduce the look-ahead to 8 chars; positive means fewer than 8 are left.
    subptr(len, 8);
    jccb(Assembler::greater, L_copy_8_chars_exit);

    // 8 chars per iteration; only the low 8 packed bytes are stored.
    bind(L_copy_8_chars);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
    ptest(tmp3Reg, tmp1Reg);
    jccb(Assembler::notZero, L_copy_8_chars_exit);
    packuswb(tmp3Reg, tmp1Reg);
    movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
    addptr(len, 8);
    jccb(Assembler::lessEqual, L_copy_8_chars);

    // Drop the remaining look-ahead: len is again minus the number of chars
    // not yet processed, and zero means everything has been copied.
    bind(L_copy_8_chars_exit);
    subptr(len, 8);
    jccb(Assembler::zero, L_done);
  }

  // Scalar tail: one char per iteration until len reaches zero or a char
  // with any 'short_mask' bit set is found.
  bind(L_copy_1_char);
  load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
  testl(tmp5, short_mask); // check if Unicode or non-ASCII char
  jccb(Assembler::notZero, L_copy_1_char_exit);
  movb(Address(dst, len, Address::times_1, 0), tmp5);
  addptr(len, 1);
  jccb(Assembler::less, L_copy_1_char);

  bind(L_copy_1_char_exit);
  addptr(result, len); // len is negative count of not processed elements

  bind(L_done);
}
6509
6510 /**
6511 * Helper for multiply_to_len().
6512 */
6513 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
6514 addq(dest_lo, src1);
6515 adcq(dest_hi, 0);
6516 addq(dest_lo, src2);
6517 adcq(dest_hi, 0);
6518 }
6519
/**
 * Multiply 64 bit by 64 bit first loop.
 *
 * Emits the first pass used by multiply_to_len(): a 64-bit multiplicand taken
 * from x[xstart] is multiplied with y, two ints at a time, and the running
 * product is stored into z.
 *
 * x, y, z   - bases of the int arrays addressed below
 * xstart    - index into x; decremented once on entry
 * x_xstart  - receives the multiplicand loaded from x
 * y_idx     - scratch for the multiplier loaded from y
 * carry     - running carry, read and rewritten on every iteration
 * product   - scratch. NOTE(review): mulq uses an implicit rax operand (see
 *             the "product(rax)" comment below), so this presumably has to be
 *             rax - confirm at the caller.
 * idx, kdx  - indices into y and z; both are modified
 * rdx is used explicitly below and therefore clobbered.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  // jlong carry, x[], y[], z[];
  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //   huge_128 product = y[idx] * x[xstart] + carry;
  //   z[kdx] = (jlong)product;
  //   carry = (jlong)(product >>> 64);
  // }
  // z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // Step back one more int so that a 64-bit load covers x[xstart-1..xstart];
  // if that underflows there is only a single int, handled at L_one_x.
  decrementl(xstart);
  jcc(Assembler::negative, L_one_x);

  movq(x_xstart, Address(x, xstart, Address::times_4, 0));
  rorq(x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  // Consume two ints of y per iteration: the first decrement going negative
  // means y is exhausted, the second means exactly one int is left.
  decrementl(idx);
  jcc(Assembler::negative, L_first_loop_exit);
  decrementl(idx);
  jcc(Assembler::negative, L_one_y);
  movq(y_idx, Address(y, idx, Address::times_4, 0));
  rorq(y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);
  movq(product, x_xstart);
  mulq(y_idx); // product(rax) * y_idx -> rdx:rax
  addq(product, carry);
  adcq(rdx, 0);
  subl(kdx, 2);
  // Store the low 64 bits as two ints, low half at the higher index.
  movl(Address(z, kdx, Address::times_4, 4), product);
  shrq(product, 32);
  movl(Address(z, kdx, Address::times_4, 0), product);
  movq(carry, rdx);
  jmp(L_first_loop);

  // Only y[0] is left: use it as a zero-extended 32-bit multiplier.
  bind(L_one_y);
  movl(y_idx, Address(y, 0));
  jmp(L_multiply);

  // Only x[0] is available: use it as a zero-extended 32-bit multiplicand.
  bind(L_one_x);
  movl(x_xstart, Address(x, 0));
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}
6575
6576 /**
6577 * Multiply 64 bit by 64 bit and add 128 bit.
6578 */
6579 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
6580 Register yz_idx, Register idx,
6581 Register carry, Register product, int offset) {
6582 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
6583 // z[kdx] = (jlong)product;
6584
6585 movq(yz_idx, Address(y, idx, Address::times_4, offset));
6586 rorq(yz_idx, 32); // convert big-endian to little-endian
6587 movq(product, x_xstart);
6588 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
6589 movq(yz_idx, Address(z, idx, Address::times_4, offset));
6590 rorq(yz_idx, 32); // convert big-endian to little-endian
6591
6592 add2_with_carry(rdx, product, carry, yz_idx);
6593
6594 movl(Address(z, idx, Address::times_4, offset+4), product);
6595 shrq(product, 32);
6596 movl(Address(z, idx, Address::times_4, offset), product);
6597
6598 }
6599
/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
 *
 * Emits the inner ("third") loop of multiply_to_len() for the non-BMI2 case:
 * x_xstart is multiplied with y and accumulated into z, four ints of y/z per
 * unrolled iteration, followed by a tail for the remaining 0..3 ints.
 *
 * yz_idx, jdx, carry2 - scratch
 * idx                 - number of ints of y to process; modified
 * carry               - carry in / carry out
 * product             - scratch; presumably rax, since mulq is used on it
 *                       (see the "product(rax)" comments) - confirm at the caller
 * rdx is clobbered.
 */
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
                                             Register yz_idx, Register idx, Register jdx,
                                             Register carry, Register product,
                                             Register carry2) {
  // jlong carry, x[], y[], z[];
  // int kdx = ystart+1;
  // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //   huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //   z[kdx+idx+1] = (jlong)product;
  //   jlong carry2 = (jlong)(product >>> 64);
  //   product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //   z[kdx+idx] = (jlong)product;
  //   carry = (jlong)(product >>> 64);
  // }
  // idx += 2;
  // if (idx > 0) {
  //   product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //   z[kdx+idx] = (jlong)product;
  //   carry = (jlong)(product >>> 64);
  // }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = idx / 4: number of unrolled iterations, each covering four ints.
  movl(jdx, idx);
  andl(jdx, 0xFFFFFFFC);
  shrl(jdx, 2);

  bind(L_third_loop);
  subl(jdx, 1);
  jcc(Assembler::negative, L_third_loop_exit);
  subl(idx, 4);

  // Second int pair of the group first (byte offset 8), then the first pair
  // (offset 0); the carry out of one step (left in rdx) feeds the next.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
  movq(carry2, rdx);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
  movq(carry, rdx);
  jmp(L_third_loop);

  bind (L_third_loop_exit);

  // Tail: idx & 3 ints are left.
  andl (idx, 0x3);
  jcc(Assembler::zero, L_post_third_loop_done);

  // Two or three left: process one more 64-bit pair.
  Label L_check_1;
  subl(idx, 2);
  jcc(Assembler::negative, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
  movq(carry, rdx);

  bind (L_check_1);
  // Restore the remainder and keep its low bit: an odd count leaves a single
  // int, which is processed at index 0 after the decrement.
  addl (idx, 0x2);
  andl (idx, 0x1);
  subl(idx, 1);
  jcc(Assembler::negative, L_post_third_loop_done);

  // Single int: rdx:product = x_xstart * y[idx] + z[idx] + carry, with the
  // 32-bit loads zero-extended.
  movl(yz_idx, Address(y, idx, Address::times_4, 0));
  movq(product, x_xstart);
  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
  movl(yz_idx, Address(z, idx, Address::times_4, 0));

  add2_with_carry(rdx, product, yz_idx, carry);

  movl(Address(z, idx, Address::times_4, 0), product);
  shrq(product, 32);

  // carry = (rdx:product) >> 32, i.e. everything above the int just stored.
  shlq(rdx, 32);
  orq(product, rdx);
  movq(carry, product);

  bind(L_post_third_loop_done);
}
6677
/**
 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
 *
 * BMI2 variant of the inner ("third") loop of multiply_to_len(). The
 * multiplicand is not a parameter: mulxq takes it implicitly from rdx (see the
 * "* rdx" comments below), so the caller must have loaded it there.
 *
 * y, z               - bases of the int arrays
 * carry              - carry in / carry out
 * idx                - number of ints of y to process; modified
 * carry2, jdx, yz_idx1, yz_idx2, tmp, tmp3, tmp4 - scratch
 */
void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
                                                  Register carry, Register carry2,
                                                  Register idx, Register jdx,
                                                  Register yz_idx1, Register yz_idx2,
                                                  Register tmp, Register tmp3, Register tmp4) {
  assert(UseBMI2Instructions, "should be used only when BMI2 is available");

  // jlong carry, x[], y[], z[];
  // int kdx = ystart+1;
  // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //   huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
  //   jlong carry2 = (jlong)(tmp3 >>> 64);
  //   huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
  //   carry = (jlong)(tmp4 >>> 64);
  //   z[kdx+idx+1] = (jlong)tmp3;
  //   z[kdx+idx] = (jlong)tmp4;
  // }
  // idx += 2;
  // if (idx > 0) {
  //   yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
  //   z[kdx+idx] = (jlong)yz_idx1;
  //   carry = (jlong)(yz_idx1 >>> 64);
  // }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = idx / 4: number of unrolled iterations, each covering four ints.
  movl(jdx, idx);
  andl(jdx, 0xFFFFFFFC);
  shrl(jdx, 2);

  bind(L_third_loop);
  subl(jdx, 1);
  jcc(Assembler::negative, L_third_loop_exit);
  subl(idx, 4);

  // Load both int pairs of y for this group (second pair into yz_idx1).
  movq(yz_idx1, Address(y, idx, Address::times_4, 8));
  rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  movq(yz_idx2, Address(y, idx, Address::times_4, 0));
  rorxq(yz_idx2, yz_idx2, 32);

  mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
  mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp

  // Reuse yz_idx1/yz_idx2 for the matching int pairs of z.
  movq(yz_idx1, Address(z, idx, Address::times_4, 8));
  rorxq(yz_idx1, yz_idx1, 32);
  movq(yz_idx2, Address(z, idx, Address::times_4, 0));
  rorxq(yz_idx2, yz_idx2, 32);

  // carry2:tmp4:tmp3 = (carry2:tmp << 64) + tmp4:tmp3 + carry + yz_idx1 + (yz_idx2 << 64)
  if (VM_Version::supports_adx()) {
    // Two independent carry chains: adcx propagates through CF, adox through OF.
    // NOTE(review): both chains assume CF and OF are clear on entry; that has
    // to come from the subl(idx, 4) above - confirm.
    adcxq(tmp3, carry);
    adoxq(tmp3, yz_idx1);

    adcxq(tmp4, tmp);
    adoxq(tmp4, yz_idx2);

    movl(carry, 0); // does not affect flags
    adcxq(carry2, carry);
    adoxq(carry2, carry);
  } else {
    add2_with_carry(tmp4, tmp3, carry, yz_idx1);
    add2_with_carry(carry2, tmp4, tmp, yz_idx2);
  }
  movq(carry, carry2);

  // Store the 128-bit result as four ints, least significant at the highest offset.
  movl(Address(z, idx, Address::times_4, 12), tmp3);
  shrq(tmp3, 32);
  movl(Address(z, idx, Address::times_4, 8), tmp3);

  movl(Address(z, idx, Address::times_4, 4), tmp4);
  shrq(tmp4, 32);
  movl(Address(z, idx, Address::times_4, 0), tmp4);

  jmp(L_third_loop);

  bind (L_third_loop_exit);

  // Tail: idx & 3 ints are left.
  andl (idx, 0x3);
  jcc(Assembler::zero, L_post_third_loop_done);

  // Two or three left: process one more 64-bit pair.
  Label L_check_1;
  subl(idx, 2);
  jcc(Assembler::negative, L_check_1);

  movq(yz_idx1, Address(y, idx, Address::times_4, 0));
  rorxq(yz_idx1, yz_idx1, 32);
  mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
  movq(yz_idx2, Address(z, idx, Address::times_4, 0));
  rorxq(yz_idx2, yz_idx2, 32);

  add2_with_carry(tmp4, tmp3, carry, yz_idx2);

  movl(Address(z, idx, Address::times_4, 4), tmp3);
  shrq(tmp3, 32);
  movl(Address(z, idx, Address::times_4, 0), tmp3);
  movq(carry, tmp4);

  bind (L_check_1);
  // Restore the remainder and keep its low bit: an odd count leaves a single
  // int, which is processed at index 0 after the decrement.
  addl (idx, 0x2);
  andl (idx, 0x1);
  subl(idx, 1);
  jcc(Assembler::negative, L_post_third_loop_done);
  movl(tmp4, Address(y, idx, Address::times_4, 0));
  mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
  movl(tmp4, Address(z, idx, Address::times_4, 0));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  movl(Address(z, idx, Address::times_4, 0), tmp3);
  shrq(tmp3, 32);

  // carry = (carry2:tmp3) >> 32, i.e. everything above the int just stored.
  shlq(carry2, 32);
  orq(tmp3, carry2);
  movq(carry, tmp3);

  bind(L_post_third_loop_done);
}
6799
/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * rdi: x
 * rax: xlen
 * rsi: y
 * rcx: ylen
 * r8: z
 * r11: tmp0
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 * tmp0..tmp5 and xlen are saved on the stack on entry and restored before the
 * emitted code falls out at the end. rdx is used as well (it must differ from
 * every argument, see the assert) and is not saved here.
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register tmp0,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);

  push(tmp0);
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  push(xlen);

  // Register roles for the first loop. xlen doubles as 'product' once its
  // value has been copied into xstart and kdx below.
  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = xlen;
  const Register x_xstart = tmp0;

  // First Loop.
  //
  // final static long LONG_MASK = 0xffffffffL;
  // int xstart = xlen - 1;
  // int ystart = ylen - 1;
  // long carry = 0;
  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //   long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //   z[kdx] = (int)product;
  //   carry = product >>> 32;
  // }
  // z[xstart] = (int)carry;
  //

  movl(idx, ylen); // idx = ylen;
  lea(kdx, Address(xlen, ylen)); // kdx = xlen+ylen;
  xorq(carry, carry); // carry = 0;

  Label L_done;

  movl(xstart, xlen);
  decrementl(xstart);
  jcc(Assembler::negative, L_done); // xlen == 0: nothing to do

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  // Store the carry left over from the first loop into the one or two ints
  // of z below kdx, if there are any.
  Label L_second_loop;
  testl(kdx, kdx);
  jcc(Assembler::zero, L_second_loop);

  Label L_carry;
  subl(kdx, 1);
  jcc(Assembler::zero, L_carry);

  movl(Address(z, kdx, Address::times_4, 0), carry);
  shrq(carry, 32);
  subl(kdx, 1);

  bind(L_carry);
  movl(Address(z, kdx, Address::times_4, 0), carry);

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx

  const Register jdx = tmp1;

  bind(L_second_loop);
  xorl(carry, carry); // carry = 0;
  movl(jdx, ylen); // j = ystart+1

  subl(xstart, 1); // i = xstart-1;
  jcc(Assembler::negative, L_done);

  // z is re-based for the inner loop; the original base is restored by the
  // pop(z) after it.
  push (z);

  Label L_last_x;
  lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
  subl(xstart, 1); // i = xstart-1;
  jcc(Assembler::negative, L_last_x); // only x[0] is left

  // Load the next 64 bits of x as the multiplicand: into rdx for the mulx
  // based loop, into x_xstart otherwise.
  if (UseBMI2Instructions) {
    movq(rdx, Address(x, xstart, Address::times_4, 0));
    rorxq(rdx, rdx, 32); // convert big-endian to little-endian
  } else {
    movq(x_xstart, Address(x, xstart, Address::times_4, 0));
    rorq(x_xstart, 32); // convert big-endian to little-endian
  }

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  // x, xstart (tmp3) and ylen are handed to the inner loop as scratch
  // registers (carry2, tmp3 and jdx respectively), so save them around it.
  push (x);
  push (xstart);
  push (ylen);


  if (UseBMI2Instructions) {
    multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
  } else { // !UseBMI2Instructions
    multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
  }

  // The saved xstart is popped into xlen (a scratch register by now) and
  // copied back into tmp3 (xstart) just below.
  pop(ylen);
  pop(xlen);
  pop(x);
  pop(z);

  // Store the carry of this row: low int at z[xstart+1], high int at
  // z[xstart] unless xstart is negative.
  movl(tmp3, xlen);
  addl(tmp3, 1);
  movl(Address(z, tmp3, Address::times_4, 0), carry);
  subl(tmp3, 1);
  jccb(Assembler::negative, L_done);

  shrq(carry, 32);
  movl(Address(z, tmp3, Address::times_4, 0), carry);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  // Only a single int of x is left: use x[0], zero-extended, as the multiplicand.
  bind(L_last_x);
  if (UseBMI2Instructions) {
    movl(rdx, Address(x, 0));
  } else {
    movl(x_xstart, Address(x, 0));
  }
  jmp(L_third_loop_prologue);

  bind(L_done);

  // Restore in reverse order of the pushes on entry.
  pop(xlen);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
  pop(tmp0);
}
6968
// Emits code that compares the memory at obja and objb and leaves in 'result'
// either the index of the first mismatch or -1 if the compared ranges are equal.
//
// 'length' is scaled on entry by the single-operand shlq and 'result' is scaled
// back by the single-operand shrq before DONE; both presumably shift by cl, with
// log2_array_indxscale expected in rcx (see the comment below) - confirm.
// length, tmp1, tmp2 and the three vector registers are clobbered; the AVX-512
// path additionally uses k3 and k7.
void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
                                         Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
  Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
  Label VECTOR8_TAIL, VECTOR4_TAIL;
  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
  Label SAME_TILL_END, DONE;
  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;

  //scale is in rcx in both Win64 and Unix
  ShortBranchVerifier sbv(this);

  // From here on 'length' is a byte count and 'result' a byte offset.
  shlq(length);
  xorq(result, result);

  if ((AVX3Threshold == 0) && (UseAVX > 2) &&
      VM_Version::supports_avx512vlbw() && UseCountTrailingZerosInstruction) {
    Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;

    cmpq(length, 64);
    jcc(Assembler::less, VECTOR32_TAIL);

    movq(tmp1, length);
    andq(tmp1, 0x3F); // tail count
    andq(length, ~(0x3F)); //vector count

    bind(VECTOR64_LOOP);
    // AVX512 code to compare 64 byte vectors.
    evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
    evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
    kortestql(k7, k7);
    jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch
    addq(result, 64);
    subq(length, 64);
    jccb(Assembler::notZero, VECTOR64_LOOP);

    // All full 64-byte blocks matched; done if there is no tail.
    testq(tmp1, tmp1);
    jcc(Assembler::zero, SAME_TILL_END);

    // AVX512 code to compare up to 63 byte vectors.
    // k3 = ~(~0 << tmp1): one mask bit per remaining tail byte.
    mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp2, tmp2, tmp1);
    notq(tmp2);
    kmovql(k3, tmp2);

    evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
    evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);

    ktestql(k7, k3);
    jcc(Assembler::below, SAME_TILL_END); // not mismatch

    // k7 has a set bit for every byte that compared equal; the position of the
    // first clear bit is the byte offset of the mismatch within the block.
    bind(VECTOR64_NOT_EQUAL);
    kmovql(tmp1, k7);
    notq(tmp1);
    tzcntq(tmp1, tmp1);
    addq(result, tmp1);
    shrq(result);
    jmp(DONE);
    bind(VECTOR32_TAIL);
  }

  // Dispatch on the byte length: exactly 8 goes straight to the 8-byte
  // compare, fewer than 8 to the 4-byte / byte tails.
  cmpq(length, 8);
  jcc(Assembler::equal, VECTOR8_LOOP);
  jcc(Assembler::less, VECTOR4_TAIL);

  if (UseAVX >= 2) {
    Label VECTOR16_TAIL, VECTOR32_LOOP;

    cmpq(length, 16);
    jcc(Assembler::equal, VECTOR16_LOOP);
    jcc(Assembler::less, VECTOR8_LOOP);

    cmpq(length, 32);
    jccb(Assembler::less, VECTOR16_TAIL);

    // length is kept 32 below the true remainder while the loop runs, so the
    // loop condition is simply "no borrow"; the bias is added back afterwards.
    subq(length, 32);
    bind(VECTOR32_LOOP);
    vmovdqu(rymm0, Address(obja, result));
    vmovdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
    vptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
    addq(result, 32);
    subq(length, 32);
    jcc(Assembler::greaterEqual, VECTOR32_LOOP);
    addq(length, 32);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 32 bytes left //close the branch here.

    bind(VECTOR16_TAIL);
    cmpq(length, 16);
    jccb(Assembler::less, VECTOR8_TAIL);
    bind(VECTOR16_LOOP);
    movdqu(rymm0, Address(obja, result));
    movdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
    ptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
    addq(result, 16);
    subq(length, 16);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 16 bytes left
  } else {//regular intrinsics

    cmpq(length, 16);
    jccb(Assembler::less, VECTOR8_TAIL);

    // Same biased-counter scheme as the 32-byte loop above, with 16 bytes.
    subq(length, 16);
    bind(VECTOR16_LOOP);
    movdqu(rymm0, Address(obja, result));
    movdqu(rymm1, Address(objb, result));
    pxor(rymm0, rymm1);
    ptest(rymm0, rymm0);
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
    addq(result, 16);
    subq(length, 16);
    jccb(Assembler::greaterEqual, VECTOR16_LOOP);
    addq(length, 16);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 16 bytes left
  }

  bind(VECTOR8_TAIL);
  cmpq(length, 8);
  jccb(Assembler::less, VECTOR4_TAIL);
  bind(VECTOR8_LOOP);
  // tmp1 = a ^ b: non-zero exactly when the two 8-byte words differ.
  movq(tmp1, Address(obja, result));
  movq(tmp2, Address(objb, result));
  xorq(tmp1, tmp2);
  testq(tmp1, tmp1);
  jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
  addq(result, 8);
  subq(length, 8);
  jcc(Assembler::equal, SAME_TILL_END);
  //falling through if less than 8 bytes left

  bind(VECTOR4_TAIL);
  cmpq(length, 4);
  jccb(Assembler::less, BYTES_TAIL);
  bind(VECTOR4_LOOP);
  movl(tmp1, Address(obja, result));
  xorl(tmp1, Address(objb, result));
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
  addq(result, 4);
  subq(length, 4);
  jcc(Assembler::equal, SAME_TILL_END);
  //falling through if less than 4 bytes left

  // At most three bytes are compared here, fully unrolled; the third compare
  // does not touch 'length' and ends in SAME_TILL_END.
  bind(BYTES_TAIL);
  bind(BYTES_LOOP);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  decq(length);
  jcc(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  decq(length);
  jcc(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  jmp(SAME_TILL_END);

  // Mismatch inside a vector: build a per-byte "not equal" mask
  // (compare-equal XOR all-ones), move it to a general register and take the
  // index of its lowest set bit as the byte offset within the block.
  if (UseAVX >= 2) {
    bind(VECTOR32_NOT_EQUAL);
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
    vpmovmskb(tmp1, rymm0);
    bsfq(tmp1, tmp1);
    addq(result, tmp1);
    shrq(result);
    jmp(DONE);
  }

  bind(VECTOR16_NOT_EQUAL);
  if (UseAVX >= 2) {
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
    pxor(rymm0, rymm2);
  } else {
    pcmpeqb(rymm2, rymm2);
    // The SSE loop left a ^ b in rymm0; XOR-ing with b again restores a.
    pxor(rymm0, rymm1);
    pcmpeqb(rymm0, rymm1);
    pxor(rymm0, rymm2);
  }
  pmovmskb(tmp1, rymm0);
  bsfq(tmp1, tmp1);
  addq(result, tmp1);
  shrq(result);
  jmpb(DONE);

  // tmp1 holds a ^ b of the mismatching word: lowest set bit / 8 is the byte
  // offset of the first differing byte.
  bind(VECTOR8_NOT_EQUAL);
  bind(VECTOR4_NOT_EQUAL);
  bsfq(tmp1, tmp1);
  shrq(tmp1, 3);
  addq(result, tmp1);
  bind(BYTES_NOT_EQUAL);
  shrq(result);
  jmpb(DONE);

  bind(SAME_TILL_END);
  mov64(result, -1);

  bind(DONE);
}
7188
7189 //Helper functions for square_to_len()
7190
/**
 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
 * Preserves x and z and modifies rest of the registers.
 *
 * tmp1 is used as the int index into x, tmp4 as the int index into z and tmp5
 * carries the bit shifted out of one square into the next.
 * NOTE(review): mulq has implicit rax/rdx operands (see the "rdx:rax" comment
 * below), so rdxReg and raxReg presumably have to be rdx and rax - confirm at
 * the caller.
 */
void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  // Perform square and right shift by 1
  // Handle odd xlen case first, then for even xlen do the following
  // jlong carry = 0;
  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
  //   huge_128 product = x[j:j+1] * x[j:j+1];
  //   z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
  //   z[i+2:i+3] = (jlong)(product >>> 1);
  //   carry = (jlong)product;
  // }

  xorq(tmp5, tmp5); // carry
  xorq(rdxReg, rdxReg);
  xorl(tmp1, tmp1); // index for x
  xorl(tmp4, tmp4); // index for z

  Label L_first_loop, L_first_loop_exit;

  testl(xlen, 1);
  jccb(Assembler::zero, L_first_loop); //jump if xlen is even

  // Square and right shift by 1 the odd element using 32 bit multiply
  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
  imulq(raxReg, raxReg);
  shrq(raxReg, 1);
  adcq(tmp5, 0); // keep the bit shifted out as the carry for the next element
  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
  incrementl(tmp1);
  addl(tmp4, 2);

  // Square and right shift by 1 the rest using 64 bit multiply
  bind(L_first_loop);
  cmpptr(tmp1, xlen);
  jccb(Assembler::equal, L_first_loop_exit);

  // Square
  movq(raxReg, Address(x, tmp1, Address::times_4, 0));
  rorq(raxReg, 32); // convert big-endian to little-endian
  mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax

  // Right shift by 1 and save carry
  // shrq moves the previous carry bit into CF, the two rcrq rotate it through
  // rdx and rax, and the adcq captures the bit that falls out of rax.
  shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
  rcrq(rdxReg, 1);
  rcrq(raxReg, 1);
  adcq(tmp5, 0);

  // Store result in z
  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
  movq(Address(z, tmp4, Address::times_4, 8), raxReg);

  // Update indices for x and z
  addl(tmp1, 2);
  addl(tmp4, 4);
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}
7252
7253
7254 /**
7255 * Perform the following multiply add operation using BMI2 instructions
7256 * carry:sum = sum + op1*op2 + carry
7257 * op2 should be in rdx
7258 * op2 is preserved, all other registers are modified
7259 */
7260 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
7261 // assert op2 is rdx
7262 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1
7263 addq(sum, carry);
7264 adcq(tmp2, 0);
7265 addq(sum, op1);
7266 adcq(tmp2, 0);
7267 movq(carry, tmp2);
7268 }
7269
7270 /**
7271 * Perform the following multiply add operation:
7272 * carry:sum = sum + op1*op2 + carry
7273 * Preserves op1, op2 and modifies rest of registers
7274 */
7275 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
7276 // rdx:rax = op1 * op2
7277 movq(raxReg, op2);
7278 mulq(op1);
7279
7280 // rdx:rax = sum + carry + rdx:rax
7281 addq(sum, carry);
7282 adcq(rdxReg, 0);
7283 addq(sum, raxReg);
7284 adcq(rdxReg, 0);
7285
7286 // carry:sum = rdx:sum
7287 movq(carry, rdxReg);
7288 }
7289
/**
 * Add 64 bit long carry into z[] with carry propagation.
 * Preserves z and carry register values and modifies rest of registers.
 *
 * zlen is used as the running int index into z and is modified; tmp1 is
 * overwritten with the constant 1.
 */
void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
  Label L_fourth_loop, L_fourth_loop_exit;

  movl(tmp1, 1);
  subl(zlen, 2);
  // Add 'carry' to the 64 bits at z[zlen-2 .. zlen-1] (zlen as passed in).
  addq(Address(z, zlen, Address::times_4, 0), carry);

  // The branch at the loop head tests the carry flag of the most recent addq:
  // the one above on the first pass, the one at the bottom of the loop afterwards.
  bind(L_fourth_loop);
  jccb(Assembler::carryClear, L_fourth_loop_exit);
  subl(zlen, 2);
  jccb(Assembler::negative, L_fourth_loop_exit); // ran off the front of z
  addq(Address(z, zlen, Address::times_4, 0), tmp1); // propagate: add 1 to the next 64 bits
  jmp(L_fourth_loop);
  bind(L_fourth_loop_exit);
}
7310
7311 /**
7312 * Shift z[] left by 1 bit.
7313 * Preserves x, len, z and zlen registers and modifies rest of the registers.
7314 *
7315 */
7316 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
7317
7318 Label L_fifth_loop, L_fifth_loop_exit;
7319
7320 // Fifth loop
7321 // Perform primitiveLeftShift(z, zlen, 1)
7322
7323 const Register prev_carry = tmp1;
7324 const Register new_carry = tmp4;
7325 const Register value = tmp2;
7326 const Register zidx = tmp3;
7327
7328 // int zidx, carry;
7329 // long value;
7330 // carry = 0;
7331 // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
7332 // (carry:value) = (z[i] << 1) | carry ;
7333 // z[i] = value;
7334 // }
7335
7336 movl(zidx, zlen);
7337 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
7338
7339 bind(L_fifth_loop);
7340 decl(zidx); // Use decl to preserve carry flag
7341 decl(zidx);
7342 jccb(Assembler::negative, L_fifth_loop_exit);
7343
7344 if (UseBMI2Instructions) {
7345 movq(value, Address(z, zidx, Address::times_4, 0));
7346 rclq(value, 1);
7347 rorxq(value, value, 32);
7348 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
7349 }
7350 else {
7351 // clear new_carry
7352 xorl(new_carry, new_carry);
7353
7354 // Shift z[i] by 1, or in previous carry and save new carry
7355 movq(value, Address(z, zidx, Address::times_4, 0));
7356 shlq(value, 1);
7357 adcl(new_carry, 0);
7358
7359 orq(value, prev_carry);
7360 rorq(value, 0x20);
7361 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
7362
7363 // Set previous carry = new carry
7364 movl(prev_carry, new_carry);
7365 }
7366 jmp(L_fifth_loop);
7367
7368 bind(L_fifth_loop_exit);
7369 }
7370
7371
/**
 * Code for BigInteger::squareToLen() intrinsic
 *
 * rdi: x
 * rsi: len
 * r8:  z
 * rcx: zlen
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 * Structure of the emitted code:
 *   1. square_rshift: store the squares, shifted right by one bit, into z.
 *   2. Second/third/fourth loops: add in the off-diagonal products.
 *   3. lshift_by_1: shift z left by one bit.
 *   4. Or the lowest bit of x[len-1] into z[zlen-1].
 *
 * tmp1..tmp5 are pushed on entry and popped on exit. len and zlen are pushed
 * before the off-diagonal loops and popped again before step 3. rdxReg and
 * raxReg are handed to the multiply helpers and are not saved here.
 */
void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  // First loop
  // Store the squares, right shifted one bit (i.e., divided by 2).
  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Add in off-diagonal sums.
  //
  // Second, third (nested) and fourth loops.
  // zlen +=2;
  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
  //    carry = 0;
  //    long op2 = x[xidx:xidx+1];
  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
  //       k -= 2;
  //       long op1 = x[j:j+1];
  //       long sum = z[k:k+1];
  //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
  //       z[k:k+1] = sum;
  //    }
  //    add_one_64(z, k, carry, tmp_regs);
  // }

  const Register carry = tmp5;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  Register op2 = tmp2;

  // Outer save of zlen/len; popped last at L_second_loop_exit.
  push(zlen);
  push(len);
  addl(zlen,2);
  bind(L_second_loop);
  xorq(carry, carry);
  subl(zlen, 4);
  subl(len, 2);
  // Per-iteration copy: the third loop and add_one_64 modify both registers,
  // the copy is popped again before jumping back to L_second_loop.
  push(zlen);
  push(len);
  cmpl(len, 0);
  jccb(Assembler::lessEqual, L_second_loop_exit);

  // Multiply an array by one 64 bit long.
  if (UseBMI2Instructions) {
    // multiply_add_64_bmi2 expects op2 in rdxReg.
    op2 = rdxReg;
    movq(op2, Address(x, len, Address::times_4, 0));
    rorxq(op2, op2, 32);
  }
  else {
    movq(op2, Address(x, len, Address::times_4, 0));
    rorq(op2, 32);
  }

  bind(L_third_loop);
  decrementl(len);
  jccb(Assembler::negative, L_third_loop_exit);   // nothing left in x
  decrementl(len);
  jccb(Assembler::negative, L_last_x);            // exactly one int left in x

  movq(op1, Address(x, len, Address::times_4, 0));
  rorq(op1, 32);

  bind(L_multiply);
  subl(zlen, 2);
  movq(sum, Address(z, zlen, Address::times_4, 0));

  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  movq(Address(z, zlen, Address::times_4, 0), sum);

  jmp(L_third_loop);
  bind(L_third_loop_exit);

  // Fourth loop
  // Add 64 bit long carry into z with carry propagation.
  // Uses offsetted zlen.
  add_one_64(z, zlen, carry, tmp1);

  // Drop back to the values pushed at the top of this iteration.
  pop(len);
  pop(zlen);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  movl(op1, Address(x, 0));   // single leading int of x, loaded as a 32-bit value
  jmp(L_multiply);

  bind(L_second_loop_exit);
  // First pair: the per-iteration copy pushed just before the exit test.
  pop(len);
  pop(zlen);
  // Second pair: the outer save, i.e. the values len and zlen had on entry.
  pop(len);
  pop(zlen);

  // Fifth loop
  // Shift z left 1 bit.
  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);

  // z[zlen-1] |= x[len-1] & 1;
  movl(tmp3, Address(x, len, Address::times_4, -4));
  andl(tmp3, 1);
  orl(Address(z, zlen, Address::times_4, -4), tmp3);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
7505
7506 /**
7507 * Helper function for mul_add()
7508 * Multiply the in[] by int k and add to out[] starting at offset offs using
7509 * 128 bit by 32 bit multiply and return the carry in tmp5.
7510 * Only quad int aligned length of in[] is operated on in this function.
7511 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
7512 * This function preserves out, in and k registers.
7513 * len and offset point to the appropriate index in "in" & "out" correspondingly
7514 * tmp5 has the carry.
7515 * other registers are temporary and are modified.
7516 *
7517 */
7518 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
7519 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
7520 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7521
7522 Label L_first_loop, L_first_loop_exit;
7523
7524 movl(tmp1, len);
7525 shrl(tmp1, 2);
7526
7527 bind(L_first_loop);
7528 subl(tmp1, 1);
7529 jccb(Assembler::negative, L_first_loop_exit);
7530
7531 subl(len, 4);
7532 subl(offset, 4);
7533
7534 Register op2 = tmp2;
7535 const Register sum = tmp3;
7536 const Register op1 = tmp4;
7537 const Register carry = tmp5;
7538
7539 if (UseBMI2Instructions) {
7540 op2 = rdxReg;
7541 }
7542
7543 movq(op1, Address(in, len, Address::times_4, 8));
7544 rorq(op1, 32);
7545 movq(sum, Address(out, offset, Address::times_4, 8));
7546 rorq(sum, 32);
7547 if (UseBMI2Instructions) {
7548 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7549 }
7550 else {
7551 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7552 }
7553 // Store back in big endian from little endian
7554 rorq(sum, 0x20);
7555 movq(Address(out, offset, Address::times_4, 8), sum);
7556
7557 movq(op1, Address(in, len, Address::times_4, 0));
7558 rorq(op1, 32);
7559 movq(sum, Address(out, offset, Address::times_4, 0));
7560 rorq(sum, 32);
7561 if (UseBMI2Instructions) {
7562 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7563 }
7564 else {
7565 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7566 }
7567 // Store back in big endian from little endian
7568 rorq(sum, 0x20);
7569 movq(Address(out, offset, Address::times_4, 0), sum);
7570
7571 jmp(L_first_loop);
7572 bind(L_first_loop_exit);
7573 }
7574
7575 /**
7576 * Code for BigInteger::mulAdd() intrinsic
7577 *
7578 * rdi: out
7579 * rsi: in
7580 * r11: offs (out.length - offset)
7581 * rcx: len
7582 * r8: k
7583 * r12: tmp1
7584 * r13: tmp2
7585 * r14: tmp3
7586 * r15: tmp4
7587 * rbx: tmp5
7588 * Multiply the in[] by word k and add to out[], return the carry in rax
7589 */
7590 void MacroAssembler::mul_add(Register out, Register in, Register offs,
7591 Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
7592 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7593
7594 Label L_carry, L_last_in, L_done;
7595
7596 // carry = 0;
7597 // for (int j=len-1; j >= 0; j--) {
7598 // long product = (in[j] & LONG_MASK) * kLong +
7599 // (out[offs] & LONG_MASK) + carry;
7600 // out[offs--] = (int)product;
7601 // carry = product >>> 32;
7602 // }
7603 //
7604 push(tmp1);
7605 push(tmp2);
7606 push(tmp3);
7607 push(tmp4);
7608 push(tmp5);
7609
7610 Register op2 = tmp2;
7611 const Register sum = tmp3;
7612 const Register op1 = tmp4;
7613 const Register carry = tmp5;
7614
7615 if (UseBMI2Instructions) {
7616 op2 = rdxReg;
7617 movl(op2, k);
7618 }
7619 else {
7620 movl(op2, k);
7621 }
7622
7623 xorq(carry, carry);
7624
7625 //First loop
7626
7627 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
7628 //The carry is in tmp5
7629 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
7630
7631 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
7632 decrementl(len);
7633 jccb(Assembler::negative, L_carry);
7634 decrementl(len);
7635 jccb(Assembler::negative, L_last_in);
7636
7637 movq(op1, Address(in, len, Address::times_4, 0));
7638 rorq(op1, 32);
7639
7640 subl(offs, 2);
7641 movq(sum, Address(out, offs, Address::times_4, 0));
7642 rorq(sum, 32);
7643
7644 if (UseBMI2Instructions) {
7645 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7646 }
7647 else {
7648 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7649 }
7650
7651 // Store back in big endian from little endian
7652 rorq(sum, 0x20);
7653 movq(Address(out, offs, Address::times_4, 0), sum);
7654
7655 testl(len, len);
7656 jccb(Assembler::zero, L_carry);
7657
7658 //Multiply the last in[] entry, if any
7659 bind(L_last_in);
7660 movl(op1, Address(in, 0));
7661 movl(sum, Address(out, offs, Address::times_4, -4));
7662
7663 movl(raxReg, k);
7664 mull(op1); //tmp4 * eax -> edx:eax
7665 addl(sum, carry);
7666 adcl(rdxReg, 0);
7667 addl(sum, raxReg);
7668 adcl(rdxReg, 0);
7669 movl(carry, rdxReg);
7670
7671 movl(Address(out, offs, Address::times_4, -4), sum);
7672
7673 bind(L_carry);
7674 //return tmp5/carry as carry in rax
7675 movl(rax, carry);
7676
7677 bind(L_done);
7678 pop(tmp5);
7679 pop(tmp4);
7680 pop(tmp3);
7681 pop(tmp2);
7682 pop(tmp1);
7683 }
7684
/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 *                      Clobbered: on exit it holds the table index (val ^ crc) & 0xFF.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  xorl(val, crc);
  andl(val, 0xFF);   // only the low byte survives, so upper bits of val on entry do not matter
  shrl(crc, 8); // unsigned shift
  xorl(crc, Address(table, val, Address::times_4, 0));   // index scaled by 4: one 32-bit entry per byte value
}
7703
/**
 * Fold 128-bit data chunk
 *
 * Emits: xcrc = clmul_hi(xcrc, xK) ^ clmul_lo(xcrc, xK) ^ (16 bytes at buf + offset)
 *
 * @param xcrc    in/out: running 128-bit fold value
 * @param xK      folding constants; not written
 * @param xtmp    scratch; clobbered
 * @param buf     base address of the data to fold in
 * @param offset  byte displacement from buf
 *
 * NOTE(review): the "[123:64]" annotations below presumably mean bits [127:64]
 * (the high quadwords) - confirm against the pclmulhdq/vpclmulhdq helpers.
 */
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
  if (UseAVX > 0) {
    // Three-operand forms: xcrc is still intact for the second multiply.
    vpclmulhdq(xtmp, xK, xcrc); // [123:64]
    vpclmulldq(xcrc, xK, xcrc); // [63:0]
    vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
    pxor(xcrc, xtmp);
  } else {
    // Two-operand forms overwrite their destination, so work on a copy of xcrc first.
    movdqa(xtmp, xcrc);
    pclmulhdq(xtmp, xK); // [123:64]
    pclmulldq(xcrc, xK); // [63:0]
    pxor(xcrc, xtmp);
    movdqu(xtmp, Address(buf, offset));   // unaligned load of the data chunk
    pxor(xcrc, xtmp);
  }
}
7722
7723 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7724 if (UseAVX > 0) {
7725 vpclmulhdq(xtmp, xK, xcrc);
7726 vpclmulldq(xcrc, xK, xcrc);
7727 pxor(xcrc, xbuf);
7728 pxor(xcrc, xtmp);
7729 } else {
7730 movdqa(xtmp, xcrc);
7731 pclmulhdq(xtmp, xK);
7732 pclmulldq(xcrc, xK);
7733 pxor(xcrc, xbuf);
7734 pxor(xcrc, xtmp);
7735 }
7736 }
7737
/**
 * 8-bit folds to compute 32-bit CRC
 *
 * uint64_t xcrc;
 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
 *
 * @param xcrc   in/out: value being folded; shifted right by one byte and
 *               xored with the table entry
 * @param table  base address of the lookup table (32-bit entries)
 * @param xtmp   scratch; receives the table entry
 * @param tmp    scratch; receives the table index
 */
void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
  movdl(tmp, xcrc);   // low 32 bits of xcrc
  andl(tmp, 0xFF);    // lowest byte selects the table entry
  movdl(xtmp, Address(table, tmp, Address::times_4, 0));
  psrldq(xcrc, 1); // unsigned shift one byte
  pxor(xcrc, xtmp);
}
7751
/**
 * General purpose register variant of the 8-bit fold.
 *
 * uint32_t crc;
 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
 *
 * @param crc    in/out: value being folded
 * @param table  base address of the lookup table (32-bit entries)
 * @param tmp    scratch; receives the table index
 */
void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
  movl(tmp, crc);
  andl(tmp, 0xFF);   // lowest byte selects the table entry
  shrl(crc, 8);
  xorl(crc, Address(table, tmp, Address::times_4, 0));
}
7762
/**
 * Emits the CRC-32 kernel based on carry-less multiplication.
 *
 * Phases of the emitted code:
 *   - fewer than 16 bytes: byte-at-a-time table lookups only (L_tail)
 *   - otherwise: byte-at-a-time until buf is 16-byte aligned, then fold the
 *     16-byte chunks (four parallel streams while enough chunks remain, then
 *     one chunk at a time), reduce the 128-bit value to 32 bits, and finish
 *     the trailing (len & 0xF) bytes byte-at-a-time.
 * crc is complemented on entry and again on exit.
 *
 * @param crc   register containing existing CRC (32-bit); holds the result on exit
 * @param buf   register pointing to input byte buffer (byte*); advanced past the consumed bytes
 * @param len   register containing number of bytes; clobbered
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 *
 * Also uses rax, xmm0-xmm5 and rscratch1 as scratch.
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
  assert_different_registers(crc, buf, len, table, tmp, rax);

  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;

  // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
  // context for the registers used, where all instructions below are using 128-bit mode
  // On EVEX without VL and BW, these instructions will all be AVX.
  lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
  notl(crc); // ~crc
  cmpl(len, 16);
  jcc(Assembler::less, L_tail);

  // Align buffer to 16 bytes
  movl(tmp, buf);
  andl(tmp, 0xF);
  jccb(Assembler::zero, L_aligned);
  // tmp = -(number of bytes up to the next 16-byte boundary); those bytes are
  // taken off len here and consumed by the loop below, which counts tmp up to 0.
  subl(tmp, 16);
  addl(len, tmp);

  align(4);
  BIND(L_align_loop);
  // The sign extension is harmless: update_byte_crc32 masks the index with 0xFF.
  movsbl(rax, Address(buf, 0)); // load byte with sign extension
  update_byte_crc32(crc, rax, table);
  increment(buf);
  incrementl(tmp);
  jccb(Assembler::less, L_align_loop);

  BIND(L_aligned);
  movl(tmp, len); // save
  shrl(len, 4);   // len = number of complete 16-byte chunks
  jcc(Assembler::zero, L_tail_restore);

  // Fold crc into first bytes of vector
  movdqa(xmm1, Address(buf, 0));
  movdl(rax, xmm1);
  xorl(crc, rax);
  if (VM_Version::supports_sse4_1()) {
    pinsrd(xmm1, crc, 0);
  } else {
    // No pinsrd available: insert the 32-bit value as two 16-bit halves.
    pinsrw(xmm1, crc, 0);
    shrl(crc, 16);
    pinsrw(xmm1, crc, 1);
  }
  addptr(buf, 16);
  subl(len, 4); // len > 0
  jcc(Assembler::less, L_fold_tail);   // fewer than 4 chunks in total

  // Start three more streams in xmm2..xmm4.
  movdqa(xmm2, Address(buf, 0));
  movdqa(xmm3, Address(buf, 16));
  movdqa(xmm4, Address(buf, 32));
  addptr(buf, 48);
  subl(len, 3);
  jcc(Assembler::lessEqual, L_fold_512b);   // not enough chunks left for a 4-stream iteration

  // Fold total 512 bits of polynomial on each iteration,
  // 128 bits per each of 4 parallel streams.
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32), rscratch1);

  align32();
  BIND(L_fold_512b_loop);
  fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
  fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
  fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
  fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
  addptr(buf, 64);
  subl(len, 4);
  jcc(Assembler::greater, L_fold_512b_loop);

  // Fold 512 bits to 128 bits.
  BIND(L_fold_512b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);

  // Fold the rest of 128 bits data chunks
  BIND(L_fold_tail);
  addl(len, 3);   // undo the bias on the chunk counter: len = chunks still to fold
  jccb(Assembler::lessEqual, L_fold_128b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);

  BIND(L_fold_tail_loop);
  fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
  addptr(buf, 16);
  decrementl(len);
  jccb(Assembler::greater, L_fold_tail_loop);

  // Fold 128 bits in xmm1 down into 32 bits in crc register.
  BIND(L_fold_128b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()), rscratch1);
  if (UseAVX > 0) {
    vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
    vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
    vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
  } else {
    // Same computation with two-operand instructions, hence the extra copies of xmm0.
    movdqa(xmm2, xmm0);
    pclmulqdq(xmm2, xmm1, 0x1);
    movdqa(xmm3, xmm0);
    pand(xmm3, xmm2);
    pclmulqdq(xmm0, xmm3, 0x1);
  }
  psrldq(xmm1, 8);
  psrldq(xmm2, 4);
  pxor(xmm0, xmm1);
  pxor(xmm0, xmm2);

  // 8 8-bit folds to compute 32-bit CRC.
  // The first four are done on the xmm register ...
  for (int j = 0; j < 4; j++) {
    fold_8bit_crc32(xmm0, table, xmm1, rax);
  }
  movdl(crc, xmm0); // mov 32 bits to general register
  // ... and the last four on the general purpose register.
  for (int j = 0; j < 4; j++) {
    fold_8bit_crc32(crc, table, rax);
  }

  BIND(L_tail_restore);
  movl(len, tmp); // restore
  BIND(L_tail);
  andl(len, 0xf);   // bytes that do not fill a complete 16-byte chunk
  jccb(Assembler::zero, L_exit);

  // Fold the rest of bytes
  align(4);
  BIND(L_tail_loop);
  movsbl(rax, Address(buf, 0)); // load byte with sign extension
  update_byte_crc32(crc, rax, table);
  increment(buf);
  decrementl(len);
  jccb(Assembler::greater, L_tail_loop);

  BIND(L_exit);
  notl(crc); // ~c
}
7905
// Helper function for AVX 512 CRC32
// Fold 512-bit data chunks
//
// Loads 64 bytes from buf + pos + offset, forms two carry-less products of xcrc and xK
// (quadword selectors 0x10 and 0x01) and leaves the xor of both products and the loaded
// data in xcrc.
//
// Besides xtmp, this helper uses xmm2 and xmm3 as fixed scratch registers: callers must
// not keep live values in them.
void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
                                             Register pos, int offset) {
  evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);   // next 64 bytes of input
  evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
  evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
  evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
  evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
}
7916
7917 // Helper function for AVX 512 CRC32
7918 // Compute CRC32 for < 256B buffers
7919 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
7920 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7921 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7922
7923 Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
7924 Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
7925 Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
7926
7927 // check if there is enough buffer to be able to fold 16B at a time
7928 cmpl(len, 32);
7929 jcc(Assembler::less, L_less_than_32);
7930
7931 // if there is, load the constants
7932 movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10
7933 movdl(xmm0, crc); // get the initial crc value
7934 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7935 pxor(xmm7, xmm0);
7936
7937 // update the buffer pointer
7938 addl(pos, 16);
7939 //update the counter.subtract 32 instead of 16 to save one instruction from the loop
7940 subl(len, 32);
7941 jmp(L_16B_reduction_loop);
7942
7943 bind(L_less_than_32);
7944 //mov initial crc to the return value. this is necessary for zero - length buffers.
7945 movl(rax, crc);
7946 testl(len, len);
7947 jcc(Assembler::equal, L_cleanup);
7948
7949 movdl(xmm0, crc); //get the initial crc value
7950
7951 cmpl(len, 16);
7952 jcc(Assembler::equal, L_exact_16_left);
7953 jcc(Assembler::less, L_less_than_16_left);
7954
7955 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7956 pxor(xmm7, xmm0); //xor the initial crc value
7957 addl(pos, 16);
7958 subl(len, 16);
7959 movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10
7960 jmp(L_get_last_two_xmms);
7961
7962 bind(L_less_than_16_left);
7963 //use stack space to load data less than 16 bytes, zero - out the 16B in memory first.
7964 pxor(xmm1, xmm1);
7965 movptr(tmp1, rsp);
7966 movdqu(Address(tmp1, 0 * 16), xmm1);
7967
7968 cmpl(len, 4);
7969 jcc(Assembler::less, L_only_less_than_4);
7970
7971 //backup the counter value
7972 movl(tmp2, len);
7973 cmpl(len, 8);
7974 jcc(Assembler::less, L_less_than_8_left);
7975
7976 //load 8 Bytes
7977 movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
7978 movq(Address(tmp1, 0 * 16), rax);
7979 addptr(tmp1, 8);
7980 subl(len, 8);
7981 addl(pos, 8);
7982
7983 bind(L_less_than_8_left);
7984 cmpl(len, 4);
7985 jcc(Assembler::less, L_less_than_4_left);
7986
7987 //load 4 Bytes
7988 movl(rax, Address(buf, pos, Address::times_1, 0));
7989 movl(Address(tmp1, 0 * 16), rax);
7990 addptr(tmp1, 4);
7991 subl(len, 4);
7992 addl(pos, 4);
7993
7994 bind(L_less_than_4_left);
7995 cmpl(len, 2);
7996 jcc(Assembler::less, L_less_than_2_left);
7997
7998 // load 2 Bytes
7999 movw(rax, Address(buf, pos, Address::times_1, 0));
8000 movl(Address(tmp1, 0 * 16), rax);
8001 addptr(tmp1, 2);
8002 subl(len, 2);
8003 addl(pos, 2);
8004
8005 bind(L_less_than_2_left);
8006 cmpl(len, 1);
8007 jcc(Assembler::less, L_zero_left);
8008
8009 // load 1 Byte
8010 movb(rax, Address(buf, pos, Address::times_1, 0));
8011 movb(Address(tmp1, 0 * 16), rax);
8012
8013 bind(L_zero_left);
8014 movdqu(xmm7, Address(rsp, 0));
8015 pxor(xmm7, xmm0); //xor the initial crc value
8016
8017 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
8018 movdqu(xmm0, Address(rax, tmp2));
8019 pshufb(xmm7, xmm0);
8020 jmp(L_128_done);
8021
8022 bind(L_exact_16_left);
8023 movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
8024 pxor(xmm7, xmm0); //xor the initial crc value
8025 jmp(L_128_done);
8026
8027 bind(L_only_less_than_4);
8028 cmpl(len, 3);
8029 jcc(Assembler::less, L_only_less_than_3);
8030
8031 // load 3 Bytes
8032 movb(rax, Address(buf, pos, Address::times_1, 0));
8033 movb(Address(tmp1, 0), rax);
8034
8035 movb(rax, Address(buf, pos, Address::times_1, 1));
8036 movb(Address(tmp1, 1), rax);
8037
8038 movb(rax, Address(buf, pos, Address::times_1, 2));
8039 movb(Address(tmp1, 2), rax);
8040
8041 movdqu(xmm7, Address(rsp, 0));
8042 pxor(xmm7, xmm0); //xor the initial crc value
8043
8044 pslldq(xmm7, 0x5);
8045 jmp(L_barrett);
8046 bind(L_only_less_than_3);
8047 cmpl(len, 2);
8048 jcc(Assembler::less, L_only_less_than_2);
8049
8050 // load 2 Bytes
8051 movb(rax, Address(buf, pos, Address::times_1, 0));
8052 movb(Address(tmp1, 0), rax);
8053
8054 movb(rax, Address(buf, pos, Address::times_1, 1));
8055 movb(Address(tmp1, 1), rax);
8056
8057 movdqu(xmm7, Address(rsp, 0));
8058 pxor(xmm7, xmm0); //xor the initial crc value
8059
8060 pslldq(xmm7, 0x6);
8061 jmp(L_barrett);
8062
8063 bind(L_only_less_than_2);
8064 //load 1 Byte
8065 movb(rax, Address(buf, pos, Address::times_1, 0));
8066 movb(Address(tmp1, 0), rax);
8067
8068 movdqu(xmm7, Address(rsp, 0));
8069 pxor(xmm7, xmm0); //xor the initial crc value
8070
8071 pslldq(xmm7, 0x7);
8072 }
8073
/**
 * Compute CRC32 using AVX512 instructions
 *  param crc   register containing existing CRC (32-bit)
 *  param buf   register pointing to input byte buffer (byte*)
 *  param len   register containing number of bytes
 *  param table address of crc or crc32c table
 *  param tmp1  scratch register
 *  param tmp2  scratch register
 *  return rax  result register
 *
 * This routine is identical for crc32c with the exception of the precomputed constant
 * table which will be passed as the table argument.  The calculation steps are
 * the same for both variants.
 *
 * NOTE(review): in the code visible here the final value is extracted into crc (pextrd
 * before L_cleanup); rax is only written with the result on the zero-length path inside
 * kernel_crc32_avx512_256B. Presumably the caller copies crc to rax - confirm.
 *
 * r12 is used as the running byte offset into buf; it is pushed on entry and popped on
 * exit. 16 * 2 + 8 bytes of stack are reserved for the duration of the routine (the
 * < 256B helper stages short inputs at [rsp]). rax and a number of xmm registers,
 * including xmm16, are used as scratch.
 */
void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
  assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);

  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
  Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
  Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
  Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;

  const Register pos = r12;
  push(r12);
  subptr(rsp, 16 * 2 + 8);   // released again at L_cleanup

  // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
  // context for the registers used, where all instructions below are using 128-bit mode
  // On EVEX without VL and BW, these instructions will all be AVX.
  movl(pos, 0);

  // check if smaller than 256B
  cmpl(len, 256);
  jcc(Assembler::less, L_less_than_256);

  // load the initial crc value
  movdl(xmm10, crc);

  // receive the initial 64B data, xor the initial crc value
  evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
  evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
  evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
  evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4

  subl(len, 256);
  cmpl(len, 256);
  jcc(Assembler::less, L_fold_128_B_loop);

  // Enough data for the 256-byte loop: start two more 64-byte streams.
  evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
  evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
  evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
  subl(len, 256);

  // Four 64-byte streams (xmm0, xmm4, xmm7, xmm8) folded per iteration.
  bind(L_fold_256_B_loop);
  addl(pos, 256);
  fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
  fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
  fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
  fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);

  subl(len, 256);
  jcc(Assembler::greaterEqual, L_fold_256_B_loop);

  // Fold 256 into 128
  addl(pos, 256);
  evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
  evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
  vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC

  evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
  evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
  vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC

  evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
  evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);

  addl(len, 128);
  jmp(L_fold_128_B_register);

  // at this section of the code, there is 128 * x + y(0 <= y<128) bytes of buffer.The fold_128_B_loop
  // loop will fold 128B at a time until we have 128 + y Bytes of buffer

  // fold 128B at a time.This section of the code folds 8 xmm registers in parallel
  bind(L_fold_128_B_loop);
  addl(pos, 128);
  fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
  fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);

  subl(len, 128);
  jcc(Assembler::greaterEqual, L_fold_128_B_loop);

  addl(pos, 128);

  // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
  // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
  bind(L_fold_128_B_register);
  evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
  evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
  evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
  evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
  // save last that has no multiplicand
  vextracti64x2(xmm7, xmm4, 3);

  evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
  evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
  // Needed later in reduction loop
  movdqu(xmm10, Address(table, 1 * 16));
  vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
  vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC

  // Swap 1,0,3,2 - 01 00 11 10
  evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
  evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
  vextracti128(xmm5, xmm8, 1);
  evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);

  // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
  // instead of a cmp instruction, we use the negative flag with the jl instruction
  addl(len, 128 - 16);
  jcc(Assembler::less, L_final_reduction_for_128);

  // Also entered from kernel_crc32_avx512_256B (inputs of 32..255 bytes).
  bind(L_16B_reduction_loop);
  vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
  vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
  vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
  movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
  vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
  addl(pos, 16);
  subl(len, 16);
  jcc(Assembler::greaterEqual, L_16B_reduction_loop);

  bind(L_final_reduction_for_128);
  addl(len, 16);                        // undo the bias: len = bytes still unprocessed (0..15)
  jcc(Assembler::equal, L_128_done);

  // Also entered from kernel_crc32_avx512_256B (inputs of 17..31 bytes).
  bind(L_get_last_two_xmms);
  movdqu(xmm2, xmm7);
  // Load the last 16 bytes of the buffer; this overlaps data that was already folded.
  addl(pos, len);
  movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
  subl(pos, len);

  // get rid of the extra data that was loaded before
  // load the shift constant
  lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
  movdqu(xmm0, Address(rax, len));
  // NOTE(review): rax is not read again in the code visible here before the routine
  // ends, so this add looks vestigial - confirm before removing.
  addl(rax, len);

  vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
  //Change mask to 512
  vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
  vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);

  blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
  vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
  vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
  vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
  vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);

  // Also entered from kernel_crc32_avx512_256B (inputs of 4..16 bytes).
  bind(L_128_done);
  // compute crc of a 128-bit value
  movdqu(xmm10, Address(table, 3 * 16));
  movdqu(xmm0, xmm7);

  // 64b fold
  vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
  vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
  vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);

  // 32b fold
  movdqu(xmm0, xmm7);
  vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
  vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
  vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
  jmp(L_barrett);

  // Inputs shorter than 256 bytes. The helper emits its code inline here and jumps to
  // the labels passed in; its last path falls through into L_barrett below.
  bind(L_less_than_256);
  kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);

  //barrett reduction
  bind(L_barrett);
  vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
  movdqu(xmm1, xmm7);
  movdqu(xmm2, xmm7);
  movdqu(xmm10, Address(table, 4 * 16));

  pclmulqdq(xmm7, xmm10, 0x0);
  pxor(xmm7, xmm2);
  vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
  movdqu(xmm2, xmm7);
  pclmulqdq(xmm7, xmm10, 0x10);
  pxor(xmm7, xmm2);
  pxor(xmm7, xmm1);
  pextrd(crc, xmm7, 2);   // final 32-bit value is taken from dword 2 of xmm7

  bind(L_cleanup);
  addptr(rsp, 16 * 2 + 8);
  pop(r12);
}
8273
// S. Gueron / Information Processing Letters 112 (2012) 184
// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
// Output: the 64-bit carry-less product of B * CONST
//
// Register usage of the emitted code:
//   in   - holds B on entry and the combined result on exit
//   n    - selects the sub-table: the table base is advanced by n * 256 * 8 bytes
//          (each sub-table is indexed by one byte and read as 8-byte entries)
//   tmp1 - accumulates Q1 ^ (Q2 << 8) ^ (Q3 << 16); overwritten
//   tmp2 - holds the current table entry (Q2, then Q3); overwritten
//   tmp3 - holds the address of the selected sub-table; overwritten
void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
                                     Register tmp1, Register tmp2, Register tmp3) {
  // tmp3 = &TABLEExt[n][0]
  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
  if (n > 0) {
    addq(tmp3, n * 256 * 8);
  }
  //    Q1 = TABLEExt[n][B & 0xFF];
  movl(tmp1, in);
  andl(tmp1, 0x000000FF);
  shll(tmp1, 3);                  // byte index -> byte offset of an 8-byte entry
  addq(tmp1, tmp3);
  movq(tmp1, Address(tmp1, 0));

  //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
  movl(tmp2, in);
  shrl(tmp2, 8);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addq(tmp2, tmp3);
  movq(tmp2, Address(tmp2, 0));

  // tmp1 = Q1 ^ (Q2 << 8)
  shlq(tmp2, 8);
  xorq(tmp1, tmp2);

  //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
  movl(tmp2, in);
  shrl(tmp2, 16);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addq(tmp2, tmp3);
  movq(tmp2, Address(tmp2, 0));

  // tmp1 ^= (Q3 << 16)
  shlq(tmp2, 16);
  xorq(tmp1, tmp2);

  //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
  // 'in' is no longer needed as the source value, so it is reused for Q4.
  shrl(in, 24);
  andl(in, 0x000000FF);
  shll(in, 3);
  addq(in, tmp3);
  movq(in, Address(in, 0));

  // in = (Q4 << 24) ^ tmp1
  shlq(in, 24);
  xorq(in, tmp1);
  //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}
8324
8325 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
8326 Register in_out,
8327 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
8328 XMMRegister w_xtmp2,
8329 Register tmp1,
8330 Register n_tmp2, Register n_tmp3) {
8331 if (is_pclmulqdq_supported) {
8332 movdl(w_xtmp1, in_out); // modified blindly
8333
8334 movl(tmp1, const_or_pre_comp_const_index);
8335 movdl(w_xtmp2, tmp1);
8336 pclmulqdq(w_xtmp1, w_xtmp2, 0);
8337
8338 movdq(in_out, w_xtmp1);
8339 } else {
8340 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
8341 }
8342 }
8343
// Recombination Alternative 2: No bit-reflections
// T1 = (CRC_A * U1) << 1
// T2 = (CRC_B * U2) << 1
// C1 = T1 >> 32
// C2 = T2 >> 32
// T1 = T1 & 0xFFFFFFFF
// T2 = T2 & 0xFFFFFFFF
// T1 = CRC32(0, T1)
// T2 = CRC32(0, T2)
// C1 = C1 ^ T1
// C2 = C2 ^ T2
// CRC = C1 ^ C2 ^ CRC_C
//
// Register usage of the emitted code:
//   in_out - CRC_A on entry, the recombined CRC on exit
//   in1    - CRC_B on entry; overwritten
//   in2    - CRC_C; only read
//   tmp1, tmp2 - scratch (T1/T2 low halves and their CRC32 results)
//   w_xtmp1, w_xtmp2, w_xtmp3, n_tmp3 - scratch handed to crc32c_pclmulqdq
// const_or_pre_comp_const_index_u1/_u2 and is_pclmulqdq_supported are forwarded
// unchanged to crc32c_pclmulqdq for the multiplications by U1 and U2.
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                     Register tmp1, Register tmp2,
                                     Register n_tmp3) {
  // in_out = CRC_A * U1, in1 = CRC_B * U2
  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);

  // First product: T1 = in_out << 1; C1 = T1 >> 32; in_out = C1 ^ CRC32(0, low 32 bits of T1)
  shlq(in_out, 1);
  movl(tmp1, in_out);
  shrq(in_out, 32);
  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);
  xorl(in_out, tmp2); // we don't care about upper 32 bit contents here

  // Second product: same sequence applied to in1 (T2 / C2)
  shlq(in1, 1);
  movl(tmp1, in1);
  shrq(in1, 32);
  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);
  xorl(in1, tmp2);

  // CRC = C1 ^ C2 ^ CRC_C
  xorl(in_out, in1);
  xorl(in_out, in2);
}
8377
// Set N to predefined value
// Subtract from a length of a buffer
// execute in a loop:
// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
// for i = 1 to N do
//  CRC_A = CRC32(CRC_A, A[i])
//  CRC_B = CRC32(CRC_B, B[i])
//  CRC_C = CRC32(CRC_C, C[i])
// end for
// Recombine
//
// Register usage of the emitted code:
//   in_out1 - remaining length in bytes; reduced by 3 * size per processed chunk
//   in_out2 - current buffer address; advanced by 3 * size per processed chunk
//   in_out3 - running CRC; accumulates the first partition and receives the
//             recombined value
//   tmp1, tmp2 - CRCs of the second and third partition (zeroed per chunk)
//   tmp3    - end address of the first partition (inner loop bound)
//   w_xtmp1..3, tmp4, tmp5, n_tmp6 - scratch handed to crc32c_rec_alt2
// 'size' is the partition size in bytes; the emitted loop repeats while at least
// 3 * size bytes remain (signed 32-bit comparison on in_out1).
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
                                       Register in_out1, Register in_out2, Register in_out3,
                                       Register tmp1, Register tmp2, Register tmp3,
                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                       Register tmp4, Register tmp5,
                                       Register n_tmp6) {
  Label L_processPartitions;
  Label L_processPartition;
  Label L_exit;

  // Outer loop: one iteration per chunk of three partitions.
  bind(L_processPartitions);
  cmpl(in_out1, 3 * size);
  jcc(Assembler::less, L_exit);
    xorl(tmp1, tmp1);
    xorl(tmp2, tmp2);
    movq(tmp3, in_out2);
    addq(tmp3, size);

    // Inner loop: feed 8 bytes from each of the three partitions per iteration.
    bind(L_processPartition);
      crc32(in_out3, Address(in_out2, 0), 8);
      crc32(tmp1, Address(in_out2, size), 8);
      crc32(tmp2, Address(in_out2, size * 2), 8);
      addq(in_out2, 8);
      cmpq(in_out2, tmp3);
      jcc(Assembler::less, L_processPartition);
    // Fold the three partial CRCs back into in_out3.
    crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
            w_xtmp1, w_xtmp2, w_xtmp3,
            tmp4, tmp5,
            n_tmp6);
    // in_out2 already moved past the first partition; skip the other two.
    addq(in_out2, 2 * size);
    subl(in_out1, 3 * size);
    jmp(L_processPartitions);

  bind(L_exit);
}
8423
// Algorithm 2: Pipelined usage of the CRC32 instruction.
// Input: A buffer I of L bytes.
// Output: the CRC32C value of the buffer.
// Notations:
// Write L = 24N + r, with N = floor (L/24).
// r = L mod 24 (0 <= r < 24).
// Consider I as the concatenation of A|B|C|R, where A, B, C, each,
// N quadwords, and R consists of r bytes.
// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
// B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
// C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
// if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
//
// Register usage of the emitted code:
//   in_out - running CRC (input and result)
//   in1    - current buffer address; advanced as data is consumed
//   in2    - remaining length in bytes; overwritten
//   tmp1..tmp6, w_xtmp1..w_xtmp3 - scratch
// The buffer is consumed in four stages: chunks of 3 * CRC32C_HIGH,
// 3 * CRC32C_MIDDLE and 3 * CRC32C_LOW bytes via crc32c_proc_chunk, then whole
// 8-byte words, then the remaining (length & 7) bytes one at a time.
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
                                          Register tmp1, Register tmp2, Register tmp3,
                                          Register tmp4, Register tmp5, Register tmp6,
                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                          bool is_pclmulqdq_supported) {
  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
  Label L_wordByWord;
  Label L_byteByByteProlog;
  Label L_byteByByte;
  Label L_exit;

  if (is_pclmulqdq_supported ) {
    // With PCLMULQDQ the entries are the constants themselves, read at code
    // generation time from the first six 32-bit words of the crc32c table.
    // Each pair of words is stored swapped (word 0 -> [1], word 1 -> [0], ...).
    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::crc32c_table_addr();
    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 1);

    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 2);
    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 3);

    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 4);
    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 5);
    assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
  } else {
    // Without PCLMULQDQ the entries are sub-table selectors for the
    // lookup-table multiplication, using the same pairwise-swapped order.
    const_or_pre_comp_const_index[0] = 1;
    const_or_pre_comp_const_index[1] = 0;

    const_or_pre_comp_const_index[2] = 3;
    const_or_pre_comp_const_index[3] = 2;

    const_or_pre_comp_const_index[4] = 5;
    const_or_pre_comp_const_index[5] = 4;
   }
  // Stage 1-3: three-way interleaved chunks, largest partition size first.
  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  // Stage 4: tmp1 = in1 + (in2 - (in2 & 7)), i.e. the end of the whole 8-byte words.
  movl(tmp1, in2);
  andl(tmp1, 0x00000007);
  negl(tmp1);
  addl(tmp1, in2);
  addq(tmp1, in1);

  cmpq(in1, tmp1);
  jccb(Assembler::greaterEqual, L_byteByByteProlog);
  align(16);
  BIND(L_wordByWord);
    crc32(in_out, Address(in1, 0), 8);
    addq(in1, 8);
    cmpq(in1, tmp1);
    jcc(Assembler::less, L_wordByWord);

  // Stage 5: in2 = number of trailing bytes; tmp2 counts from 1 up to in2.
  BIND(L_byteByByteProlog);
  andl(in2, 0x00000007);
  movl(tmp2, 1);

  cmpl(tmp2, in2);
  jccb(Assembler::greater, L_exit);
  BIND(L_byteByByte);
    crc32(in_out, Address(in1, 0), 1);
    incq(in1);
    incl(tmp2);
    cmpl(tmp2, in2);
    jcc(Assembler::lessEqual, L_byteByByte);

  BIND(L_exit);
}
8515 #undef BIND
8516 #undef BLOCK_COMMENT
8517
// Compress char[] array to byte[].
// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
// Return the array length if every element in array can be encoded,
// otherwise, the index of first non-latin1 (> 0xff) character.
//   @IntrinsicCandidate
//   public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
//     for (int i = 0; i < len; i++) {
//       char c = src[srcOff];
//       if (c > 0xff) {
//           return i;  // return index of non-latin1 char
//       }
//       dst[dstOff] = (byte)c;
//       srcOff++;
//       dstOff++;
//     }
//     return len;
//   }
//
// Structure of the emitted code:
//   1. An AVX-512 path, generated only when AVX3Threshold == 0, UseAVX > 2 and
//      avx512vlbw + bmi2 are available; taken at run time for len >= 32.
//   2. An SSE4.2 path, generated only when UseSSE42Intrinsics is set.
//   3. A scalar loop that compresses one char per iteration. The vector paths
//      fall back to it for tails and when they find a char that does not fit.
// src, dst, len, tmp5 and the XMM/mask temporaries are overwritten.
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
  XMMRegister tmp1Reg, XMMRegister tmp2Reg,
  XMMRegister tmp3Reg, XMMRegister tmp4Reg,
  Register tmp5, Register result, KRegister mask1, KRegister mask2) {
  Label copy_chars_loop, done, reset_sp, copy_tail;

  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result

  // rsi holds start addr of source char[] to be compressed
  // rdi holds start addr of destination byte[]
  // rdx holds length

  assert(len != result, "");

  // save length for return
  movl(result, len);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_loop_tail, below_threshold, reset_for_copy_tail;

    // alignment
    Label post_alignment;

    // if length of the string is less than 32, handle it the old fashioned way
    testl(len, -32);
    jcc(Assembler::zero, below_threshold);

    // First check whether a character is compressible ( <= 0xFF).
    // Create mask to test for Unicode chars inside zmm vector
    movl(tmp5, 0x00FF);
    evpbroadcastw(tmp2Reg, tmp5, Assembler::AVX_512bit);

    // The alignment step is skipped when len < 64.
    testl(len, -64);
    jccb(Assembler::zero, post_alignment);

    // tmp5 = (-dst) & 31: number of elements to process so that dst becomes
    // 32-byte aligned.
    movl(tmp5, dst);
    andl(tmp5, (32 - 1));
    negl(tmp5);
    andl(tmp5, (32 - 1));

    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jccb(Assembler::zero, post_alignment);

    // ~(~0 << len), where len is the # of remaining elements to process
    // (len is used as scratch for the mask here and restored from result below)
    movl(len, 0xFFFFFFFF);
    shlxl(len, len, tmp5);
    notl(len);
    kmovdl(mask2, len);
    movl(len, result);

    // Masked load / compare / store of the alignment prefix. If some selected
    // char is above 0xFF, restart from the beginning in the scalar loop.
    evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
    evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
    ktestd(mask1, mask2);
    jcc(Assembler::carryClear, copy_tail);

    evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);

    // Advance past the prefix: src by 2 * tmp5 bytes (chars), dst by tmp5 bytes.
    addptr(src, tmp5);
    addptr(src, tmp5);
    addptr(dst, tmp5);
    subl(len, tmp5);

    bind(post_alignment);
    // end of alignment

    movl(tmp5, len);
    andl(tmp5, (32 - 1));    // tail count (in chars)
    andl(len, ~(32 - 1));    // vector count (in chars)
    jccb(Assembler::zero, copy_loop_tail);

    // Point src/dst at the end of the vector part and index with a negative
    // len that counts up to zero.
    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

    bind(copy_32_loop);
    evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
    evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    kortestdl(mask1, mask1);
    jccb(Assembler::carryClear, reset_for_copy_tail);

    // All elements in current processed chunk are valid candidates for
    // compression. Write the truncated byte elements to the memory.
    evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
    addptr(len, 32);
    jccb(Assembler::notZero, copy_32_loop);

    bind(copy_loop_tail);
    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jcc(Assembler::zero, done);

    movl(len, tmp5);

    // ~(~0 << len), where len is the # of remaining elements to process
    movl(tmp5, 0xFFFFFFFF);
    shlxl(tmp5, tmp5, len);
    notl(tmp5);

    kmovdl(mask2, tmp5);

    // Masked load / compare / store of the tail; on failure the scalar loop
    // re-processes the tail (len holds the tail count at this point).
    evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
    evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
    ktestd(mask1, mask2);
    jcc(Assembler::carryClear, copy_tail);

    evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
    jmp(done);

    // A chunk in the vector loop contained a char above 0xFF: move src/dst to
    // the end of the tail and lower the (negative) index by the tail count, so
    // the scalar loop continues at the same element and ends after the tail.
    bind(reset_for_copy_tail);
    lea(src, Address(src, tmp5, Address::times_2));
    lea(dst, Address(dst, tmp5, Address::times_1));
    subptr(len, tmp5);
    jmp(copy_chars_loop);

    bind(below_threshold);
  }

  if (UseSSE42Intrinsics) {
    Label copy_32_loop, copy_16, copy_tail_sse, reset_for_copy_tail;

    // vectored compression
    // fewer than 8 chars: go straight to the scalar loop
    testl(len, 0xfffffff8);
    jcc(Assembler::zero, copy_tail);

    movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
    movdl(tmp1Reg, tmp5);
    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg

    andl(len, 0xfffffff0);
    jccb(Assembler::zero, copy_16);

    // compress 16 chars per iter
    pxor(tmp4Reg, tmp4Reg);

    // Same end-relative addressing with a negative index as in the scalar loop.
    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

    bind(copy_32_loop);
    movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
    por(tmp4Reg, tmp2Reg);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
    por(tmp4Reg, tmp3Reg);
    ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
    jccb(Assembler::notZero, reset_for_copy_tail);
    packuswb(tmp2Reg, tmp3Reg);    // only ASCII chars; compress each to 1 byte
    movdqu(Address(dst, len, Address::times_1), tmp2Reg);
    addptr(len, 16);
    jccb(Assembler::notZero, copy_32_loop);

    // compress next vector of 8 chars (if any)
    bind(copy_16);
    // len = 0
    testl(result, 0x00000008);     // check if there's a block of 8 chars to compress
    jccb(Assembler::zero, copy_tail_sse);

    pxor(tmp3Reg, tmp3Reg);

    movdqu(tmp2Reg, Address(src, 0));
    ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
    jccb(Assembler::notZero, reset_for_copy_tail);
    packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
    movq(Address(dst, 0), tmp2Reg);
    addptr(src, 16);
    addptr(dst, 8);
    jmpb(copy_tail_sse);

    // A vector contained a char above 0xFF: extend the range by the
    // (result & 15) chars not covered by the 16-char loop and let the scalar
    // loop find the exact position.
    bind(reset_for_copy_tail);
    movl(tmp5, result);
    andl(tmp5, 0x0000000f);
    lea(src, Address(src, tmp5, Address::times_2));
    lea(dst, Address(dst, tmp5, Address::times_1));
    subptr(len, tmp5);
    jmpb(copy_chars_loop);

    bind(copy_tail_sse);
    movl(len, result);
    andl(len, 0x00000007);    // tail count (in chars)
  }
  // compress 1 char per iter
  bind(copy_tail);
  testl(len, len);
  jccb(Assembler::zero, done);
  lea(src, Address(src, len, Address::times_2));
  lea(dst, Address(dst, len, Address::times_1));
  negptr(len);

  bind(copy_chars_loop);
  load_unsigned_short(tmp5, Address(src, len, Address::times_2));
  testl(tmp5, 0xff00);      // check if Unicode char
  jccb(Assembler::notZero, reset_sp);
  movb(Address(dst, len, Address::times_1), tmp5);  // ASCII char; compress to 1 byte
  increment(len);
  jccb(Assembler::notZero, copy_chars_loop);

  // add len then return (len will be zero if compress succeeded, otherwise negative)
  bind(reset_sp);
  addl(result, len);

  bind(done);
}
8744
// Inflate byte[] array to char[].
//   java.base/share/classes/java/lang/StringLatin1.java
//   @IntrinsicCandidate
//   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
//     for (int i = 0; i < len; i++) {
//       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
//     }
//   }
//
// Structure of the emitted code:
//   1. An AVX-512 path (UseAVX > 2 with avx512vlbw + bmi2): 32 elements per
//      iteration plus a masked tail. It is bypassed at run time when len < 16
//      or when (len & -AVX3Threshold) == 0.
//   2. An SSE4.2 path (UseSSE42Intrinsics): 16 elements per iteration when
//      UseAVX > 1, otherwise 8, followed by optional 8- and 4-element steps.
//   3. A scalar loop that inflates one byte per iteration.
// src, dst, len, tmp1, tmp2 and mask are overwritten.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
  XMMRegister tmp1, Register tmp2, KRegister mask) {
  Label copy_chars_loop, done, below_threshold, avx3_threshold;
  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp2

  // rsi holds start addr of source byte[] to be inflated
  // rdi holds start addr of destination char[]
  // rdx holds length
  assert_different_registers(src, dst, len, tmp2);
  // tmp2 keeps a copy of the length; it is reduced to the tail count below.
  movl(tmp2, len);
  if ((UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_tail;
    // len is free once the vector loop has finished, so it doubles as scratch.
    Register tmp3_aliased = len;

    // if length of the string is less than 16, handle it in an old fashioned way
    testl(len, -16);
    jcc(Assembler::zero, below_threshold);

    // No bit at or above AVX3Threshold set: use the non-512-bit code below.
    testl(len, -1 * AVX3Threshold);
    jcc(Assembler::zero, avx3_threshold);

    // In order to use only one arithmetic operation for the main loop we use
    // this pre-calculation
    andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
    andl(len, -32);     // vector count
    jccb(Assembler::zero, copy_tail);

    // Point src/dst at the end of the vector part and index with a negative
    // len that counts up to zero.
    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);


    // inflate 32 chars per iter
    bind(copy_32_loop);
    vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
    evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
    addptr(len, 32);
    jcc(Assembler::notZero, copy_32_loop);

    bind(copy_tail);
    // bail out when there is nothing to be done
    testl(tmp2, -1); // we don't destroy the contents of tmp2 here
    jcc(Assembler::zero, done);

    // ~(~0 << length), where length is the # of remaining elements to process
    movl(tmp3_aliased, -1);
    shlxl(tmp3_aliased, tmp3_aliased, tmp2);
    notl(tmp3_aliased);
    kmovdl(mask, tmp3_aliased);
    // Masked load and store of the remaining tmp2 elements.
    evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
    evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);

    jmp(done);
    bind(avx3_threshold);
  }
  if (UseSSE42Intrinsics) {
    Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;

    if (UseAVX > 1) {
      andl(tmp2, (16 - 1));
      andl(len, -16);
      jccb(Assembler::zero, copy_new_tail);
    } else {
      andl(tmp2, 0x00000007);   // tail count (in chars)
      andl(len, 0xfffffff8);    // vector count (in chars)
      jccb(Assembler::zero, copy_tail);
    }

    // vectored inflation
    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);

    if (UseAVX > 1) {
      // inflate 16 chars per iter
      bind(copy_16_loop);
      vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
      vmovdqu(Address(dst, len, Address::times_2), tmp1);
      addptr(len, 16);
      jcc(Assembler::notZero, copy_16_loop);

      // Entry point for lengths below 16 (from the AVX-512 check above) and
      // for the remainder of the 16-element loop: at most 15 elements in tmp2.
      bind(below_threshold);
      bind(copy_new_tail);
      movl(len, tmp2);
      andl(tmp2, 0x00000007);
      andl(len, 0xFFFFFFF8);
      jccb(Assembler::zero, copy_tail);

      // One block of 8 elements.
      pmovzxbw(tmp1, Address(src, 0));
      movdqu(Address(dst, 0), tmp1);
      addptr(src, 8);
      addptr(dst, 2 * 8);

      jmp(copy_tail, true);
    }

    // inflate 8 chars per iter
    bind(copy_8_loop);
    pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
    movdqu(Address(dst, len, Address::times_2), tmp1);
    addptr(len, 8);
    jcc(Assembler::notZero, copy_8_loop);

    // At most 7 elements remain (count in tmp2).
    bind(copy_tail);
    movl(len, tmp2);

    cmpl(len, 4);
    jccb(Assembler::less, copy_bytes);

    movdl(tmp1, Address(src, 0));  // load 4 byte chars
    pmovzxbw(tmp1, tmp1);
    movq(Address(dst, 0), tmp1);
    subptr(len, 4);
    addptr(src, 4);
    addptr(dst, 8);

    bind(copy_bytes);
  } else {
    bind(below_threshold);
  }

  // Scalar remainder: len holds the number of elements still to inflate.
  testl(len, len);
  jccb(Assembler::zero, done);
  lea(src, Address(src, len, Address::times_1));
  lea(dst, Address(dst, len, Address::times_2));
  negptr(len);

  // inflate 1 char per iter
  bind(copy_chars_loop);
  load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
  movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
  increment(len);
  jcc(Assembler::notZero, copy_chars_loop);

  bind(done);
}
8894
8895 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
8896 switch(type) {
8897 case T_BYTE:
8898 case T_BOOLEAN:
8899 evmovdqub(dst, kmask, src, merge, vector_len);
8900 break;
8901 case T_CHAR:
8902 case T_SHORT:
8903 evmovdquw(dst, kmask, src, merge, vector_len);
8904 break;
8905 case T_INT:
8906 case T_FLOAT:
8907 evmovdqul(dst, kmask, src, merge, vector_len);
8908 break;
8909 case T_LONG:
8910 case T_DOUBLE:
8911 evmovdquq(dst, kmask, src, merge, vector_len);
8912 break;
8913 default:
8914 fatal("Unexpected type argument %s", type2name(type));
8915 break;
8916 }
8917 }
8918
8919
8920 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
8921 switch(type) {
8922 case T_BYTE:
8923 case T_BOOLEAN:
8924 evmovdqub(dst, kmask, src, merge, vector_len);
8925 break;
8926 case T_CHAR:
8927 case T_SHORT:
8928 evmovdquw(dst, kmask, src, merge, vector_len);
8929 break;
8930 case T_INT:
8931 case T_FLOAT:
8932 evmovdqul(dst, kmask, src, merge, vector_len);
8933 break;
8934 case T_LONG:
8935 case T_DOUBLE:
8936 evmovdquq(dst, kmask, src, merge, vector_len);
8937 break;
8938 default:
8939 fatal("Unexpected type argument %s", type2name(type));
8940 break;
8941 }
8942 }
8943
8944 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
8945 switch(type) {
8946 case T_BYTE:
8947 case T_BOOLEAN:
8948 evmovdqub(dst, kmask, src, merge, vector_len);
8949 break;
8950 case T_CHAR:
8951 case T_SHORT:
8952 evmovdquw(dst, kmask, src, merge, vector_len);
8953 break;
8954 case T_INT:
8955 case T_FLOAT:
8956 evmovdqul(dst, kmask, src, merge, vector_len);
8957 break;
8958 case T_LONG:
8959 case T_DOUBLE:
8960 evmovdquq(dst, kmask, src, merge, vector_len);
8961 break;
8962 default:
8963 fatal("Unexpected type argument %s", type2name(type));
8964 break;
8965 }
8966 }
8967
8968 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
8969 switch(masklen) {
8970 case 2:
8971 knotbl(dst, src);
8972 movl(rtmp, 3);
8973 kmovbl(ktmp, rtmp);
8974 kandbl(dst, ktmp, dst);
8975 break;
8976 case 4:
8977 knotbl(dst, src);
8978 movl(rtmp, 15);
8979 kmovbl(ktmp, rtmp);
8980 kandbl(dst, ktmp, dst);
8981 break;
8982 case 8:
8983 knotbl(dst, src);
8984 break;
8985 case 16:
8986 knotwl(dst, src);
8987 break;
8988 case 32:
8989 knotdl(dst, src);
8990 break;
8991 case 64:
8992 knotql(dst, src);
8993 break;
8994 default:
8995 fatal("Unexpected vector length %d", masklen);
8996 break;
8997 }
8998 }
8999
9000 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9001 switch(type) {
9002 case T_BOOLEAN:
9003 case T_BYTE:
9004 kandbl(dst, src1, src2);
9005 break;
9006 case T_CHAR:
9007 case T_SHORT:
9008 kandwl(dst, src1, src2);
9009 break;
9010 case T_INT:
9011 case T_FLOAT:
9012 kanddl(dst, src1, src2);
9013 break;
9014 case T_LONG:
9015 case T_DOUBLE:
9016 kandql(dst, src1, src2);
9017 break;
9018 default:
9019 fatal("Unexpected type argument %s", type2name(type));
9020 break;
9021 }
9022 }
9023
9024 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9025 switch(type) {
9026 case T_BOOLEAN:
9027 case T_BYTE:
9028 korbl(dst, src1, src2);
9029 break;
9030 case T_CHAR:
9031 case T_SHORT:
9032 korwl(dst, src1, src2);
9033 break;
9034 case T_INT:
9035 case T_FLOAT:
9036 kordl(dst, src1, src2);
9037 break;
9038 case T_LONG:
9039 case T_DOUBLE:
9040 korql(dst, src1, src2);
9041 break;
9042 default:
9043 fatal("Unexpected type argument %s", type2name(type));
9044 break;
9045 }
9046 }
9047
9048 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9049 switch(type) {
9050 case T_BOOLEAN:
9051 case T_BYTE:
9052 kxorbl(dst, src1, src2);
9053 break;
9054 case T_CHAR:
9055 case T_SHORT:
9056 kxorwl(dst, src1, src2);
9057 break;
9058 case T_INT:
9059 case T_FLOAT:
9060 kxordl(dst, src1, src2);
9061 break;
9062 case T_LONG:
9063 case T_DOUBLE:
9064 kxorql(dst, src1, src2);
9065 break;
9066 default:
9067 fatal("Unexpected type argument %s", type2name(type));
9068 break;
9069 }
9070 }
9071
9072 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9073 switch(type) {
9074 case T_BOOLEAN:
9075 case T_BYTE:
9076 evpermb(dst, mask, nds, src, merge, vector_len); break;
9077 case T_CHAR:
9078 case T_SHORT:
9079 evpermw(dst, mask, nds, src, merge, vector_len); break;
9080 case T_INT:
9081 case T_FLOAT:
9082 evpermd(dst, mask, nds, src, merge, vector_len); break;
9083 case T_LONG:
9084 case T_DOUBLE:
9085 evpermq(dst, mask, nds, src, merge, vector_len); break;
9086 default:
9087 fatal("Unexpected type argument %s", type2name(type)); break;
9088 }
9089 }
9090
9091 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9092 switch(type) {
9093 case T_BOOLEAN:
9094 case T_BYTE:
9095 evpermb(dst, mask, nds, src, merge, vector_len); break;
9096 case T_CHAR:
9097 case T_SHORT:
9098 evpermw(dst, mask, nds, src, merge, vector_len); break;
9099 case T_INT:
9100 case T_FLOAT:
9101 evpermd(dst, mask, nds, src, merge, vector_len); break;
9102 case T_LONG:
9103 case T_DOUBLE:
9104 evpermq(dst, mask, nds, src, merge, vector_len); break;
9105 default:
9106 fatal("Unexpected type argument %s", type2name(type)); break;
9107 }
9108 }
9109
9110 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9111 switch(type) {
9112 case T_BYTE:
9113 evpminub(dst, mask, nds, src, merge, vector_len); break;
9114 case T_SHORT:
9115 evpminuw(dst, mask, nds, src, merge, vector_len); break;
9116 case T_INT:
9117 evpminud(dst, mask, nds, src, merge, vector_len); break;
9118 case T_LONG:
9119 evpminuq(dst, mask, nds, src, merge, vector_len); break;
9120 default:
9121 fatal("Unexpected type argument %s", type2name(type)); break;
9122 }
9123 }
9124
9125 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9126 switch(type) {
9127 case T_BYTE:
9128 evpmaxub(dst, mask, nds, src, merge, vector_len); break;
9129 case T_SHORT:
9130 evpmaxuw(dst, mask, nds, src, merge, vector_len); break;
9131 case T_INT:
9132 evpmaxud(dst, mask, nds, src, merge, vector_len); break;
9133 case T_LONG:
9134 evpmaxuq(dst, mask, nds, src, merge, vector_len); break;
9135 default:
9136 fatal("Unexpected type argument %s", type2name(type)); break;
9137 }
9138 }
9139
9140 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9141 switch(type) {
9142 case T_BYTE:
9143 evpminub(dst, mask, nds, src, merge, vector_len); break;
9144 case T_SHORT:
9145 evpminuw(dst, mask, nds, src, merge, vector_len); break;
9146 case T_INT:
9147 evpminud(dst, mask, nds, src, merge, vector_len); break;
9148 case T_LONG:
9149 evpminuq(dst, mask, nds, src, merge, vector_len); break;
9150 default:
9151 fatal("Unexpected type argument %s", type2name(type)); break;
9152 }
9153 }
9154
9155 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9156 switch(type) {
9157 case T_BYTE:
9158 evpmaxub(dst, mask, nds, src, merge, vector_len); break;
9159 case T_SHORT:
9160 evpmaxuw(dst, mask, nds, src, merge, vector_len); break;
9161 case T_INT:
9162 evpmaxud(dst, mask, nds, src, merge, vector_len); break;
9163 case T_LONG:
9164 evpmaxuq(dst, mask, nds, src, merge, vector_len); break;
9165 default:
9166 fatal("Unexpected type argument %s", type2name(type)); break;
9167 }
9168 }
9169
9170 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9171 switch(type) {
9172 case T_BYTE:
9173 evpminsb(dst, mask, nds, src, merge, vector_len); break;
9174 case T_SHORT:
9175 evpminsw(dst, mask, nds, src, merge, vector_len); break;
9176 case T_INT:
9177 evpminsd(dst, mask, nds, src, merge, vector_len); break;
9178 case T_LONG:
9179 evpminsq(dst, mask, nds, src, merge, vector_len); break;
9180 case T_FLOAT:
9181 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9182 case T_DOUBLE:
9183 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9184 default:
9185 fatal("Unexpected type argument %s", type2name(type)); break;
9186 }
9187 }
9188
9189 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9190 switch(type) {
9191 case T_BYTE:
9192 evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9193 case T_SHORT:
9194 evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9195 case T_INT:
9196 evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9197 case T_LONG:
9198 evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9199 case T_FLOAT:
9200 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9201 case T_DOUBLE:
9202 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9203 default:
9204 fatal("Unexpected type argument %s", type2name(type)); break;
9205 }
9206 }
9207
9208 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9209 switch(type) {
9210 case T_BYTE:
9211 evpminsb(dst, mask, nds, src, merge, vector_len); break;
9212 case T_SHORT:
9213 evpminsw(dst, mask, nds, src, merge, vector_len); break;
9214 case T_INT:
9215 evpminsd(dst, mask, nds, src, merge, vector_len); break;
9216 case T_LONG:
9217 evpminsq(dst, mask, nds, src, merge, vector_len); break;
9218 case T_FLOAT:
9219 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9220 case T_DOUBLE:
9221 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9222 default:
9223 fatal("Unexpected type argument %s", type2name(type)); break;
9224 }
9225 }
9226
9227 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9228 switch(type) {
9229 case T_BYTE:
9230 evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9231 case T_SHORT:
9232 evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9233 case T_INT:
9234 evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9235 case T_LONG:
9236 evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9237 case T_FLOAT:
9238 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9239 case T_DOUBLE:
9240 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9241 default:
9242 fatal("Unexpected type argument %s", type2name(type)); break;
9243 }
9244 }
9245
9246 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9247 switch(type) {
9248 case T_INT:
9249 evpxord(dst, mask, nds, src, merge, vector_len); break;
9250 case T_LONG:
9251 evpxorq(dst, mask, nds, src, merge, vector_len); break;
9252 default:
9253 fatal("Unexpected type argument %s", type2name(type)); break;
9254 }
9255 }
9256
9257 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9258 switch(type) {
9259 case T_INT:
9260 evpxord(dst, mask, nds, src, merge, vector_len); break;
9261 case T_LONG:
9262 evpxorq(dst, mask, nds, src, merge, vector_len); break;
9263 default:
9264 fatal("Unexpected type argument %s", type2name(type)); break;
9265 }
9266 }
9267
9268 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9269 switch(type) {
9270 case T_INT:
9271 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9272 case T_LONG:
9273 evporq(dst, mask, nds, src, merge, vector_len); break;
9274 default:
9275 fatal("Unexpected type argument %s", type2name(type)); break;
9276 }
9277 }
9278
9279 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9280 switch(type) {
9281 case T_INT:
9282 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9283 case T_LONG:
9284 evporq(dst, mask, nds, src, merge, vector_len); break;
9285 default:
9286 fatal("Unexpected type argument %s", type2name(type)); break;
9287 }
9288 }
9289
9290 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9291 switch(type) {
9292 case T_INT:
9293 evpandd(dst, mask, nds, src, merge, vector_len); break;
9294 case T_LONG:
9295 evpandq(dst, mask, nds, src, merge, vector_len); break;
9296 default:
9297 fatal("Unexpected type argument %s", type2name(type)); break;
9298 }
9299 }
9300
9301 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9302 switch(type) {
9303 case T_INT:
9304 evpandd(dst, mask, nds, src, merge, vector_len); break;
9305 case T_LONG:
9306 evpandq(dst, mask, nds, src, merge, vector_len); break;
9307 default:
9308 fatal("Unexpected type argument %s", type2name(type)); break;
9309 }
9310 }
9311
9312 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
9313 switch(masklen) {
9314 case 8:
9315 kortestbl(src1, src2);
9316 break;
9317 case 16:
9318 kortestwl(src1, src2);
9319 break;
9320 case 32:
9321 kortestdl(src1, src2);
9322 break;
9323 case 64:
9324 kortestql(src1, src2);
9325 break;
9326 default:
9327 fatal("Unexpected mask length %d", masklen);
9328 break;
9329 }
9330 }
9331
9332
9333 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
9334 switch(masklen) {
9335 case 8:
9336 ktestbl(src1, src2);
9337 break;
9338 case 16:
9339 ktestwl(src1, src2);
9340 break;
9341 case 32:
9342 ktestdl(src1, src2);
9343 break;
9344 case 64:
9345 ktestql(src1, src2);
9346 break;
9347 default:
9348 fatal("Unexpected mask length %d", masklen);
9349 break;
9350 }
9351 }
9352
9353 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9354 switch(type) {
9355 case T_INT:
9356 evprold(dst, mask, src, shift, merge, vlen_enc); break;
9357 case T_LONG:
9358 evprolq(dst, mask, src, shift, merge, vlen_enc); break;
9359 default:
9360 fatal("Unexpected type argument %s", type2name(type)); break;
9361 break;
9362 }
9363 }
9364
9365 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9366 switch(type) {
9367 case T_INT:
9368 evprord(dst, mask, src, shift, merge, vlen_enc); break;
9369 case T_LONG:
9370 evprorq(dst, mask, src, shift, merge, vlen_enc); break;
9371 default:
9372 fatal("Unexpected type argument %s", type2name(type)); break;
9373 }
9374 }
9375
9376 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9377 switch(type) {
9378 case T_INT:
9379 evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
9380 case T_LONG:
9381 evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
9382 default:
9383 fatal("Unexpected type argument %s", type2name(type)); break;
9384 }
9385 }
9386
9387 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9388 switch(type) {
9389 case T_INT:
9390 evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
9391 case T_LONG:
9392 evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
9393 default:
9394 fatal("Unexpected type argument %s", type2name(type)); break;
9395 }
9396 }
9397
9398 void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9399 assert(rscratch != noreg || always_reachable(src), "missing");
9400
9401 if (reachable(src)) {
9402 evpandq(dst, nds, as_Address(src), vector_len);
9403 } else {
9404 lea(rscratch, src);
9405 evpandq(dst, nds, Address(rscratch, 0), vector_len);
9406 }
9407 }
9408
9409 void MacroAssembler::evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
9410 assert(rscratch != noreg || always_reachable(src), "missing");
9411
9412 if (reachable(src)) {
9413 Assembler::evpaddq(dst, mask, nds, as_Address(src), merge, vector_len);
9414 } else {
9415 lea(rscratch, src);
9416 Assembler::evpaddq(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
9417 }
9418 }
9419
9420 void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9421 assert(rscratch != noreg || always_reachable(src), "missing");
9422
9423 if (reachable(src)) {
9424 evporq(dst, nds, as_Address(src), vector_len);
9425 } else {
9426 lea(rscratch, src);
9427 evporq(dst, nds, Address(rscratch, 0), vector_len);
9428 }
9429 }
9430
9431 void MacroAssembler::vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9432 assert(rscratch != noreg || always_reachable(src), "missing");
9433
9434 if (reachable(src)) {
9435 vpshufb(dst, nds, as_Address(src), vector_len);
9436 } else {
9437 lea(rscratch, src);
9438 vpshufb(dst, nds, Address(rscratch, 0), vector_len);
9439 }
9440 }
9441
9442 void MacroAssembler::vpor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9443 assert(rscratch != noreg || always_reachable(src), "missing");
9444
9445 if (reachable(src)) {
9446 Assembler::vpor(dst, nds, as_Address(src), vector_len);
9447 } else {
9448 lea(rscratch, src);
9449 Assembler::vpor(dst, nds, Address(rscratch, 0), vector_len);
9450 }
9451 }
9452
9453 void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) {
9454 assert(rscratch != noreg || always_reachable(src3), "missing");
9455
9456 if (reachable(src3)) {
9457 vpternlogq(dst, imm8, src2, as_Address(src3), vector_len);
9458 } else {
9459 lea(rscratch, src3);
9460 vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len);
9461 }
9462 }
9463
9464 #ifdef COMPILER2
9465
// Emits a predicated vector store of xmm to dst.
//   bt       element type passed to evmovdqu
//   dst      destination memory operand
//   xmm      vector register holding the data to store
//   mask     opmask register; overwritten with the computed store mask
//   length   register used as the bzhiq index operand (the mask is built from it)
//   temp     scratch register; overwritten
//   vec_enc  vector length encoding passed to evmovdqu
void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
                                 Register length, Register temp, int vec_enc) {
  // Computing mask for predicated vector store.
  // Start from all ones, let bzhiq derive the mask value from `length`,
  // then move the result into the opmask register.
  movptr(temp, -1);
  bzhiq(temp, temp, length);
  kmov(mask, temp);
  evmovdqu(bt, mask, dst, xmm, true, vec_enc);
}
9474
9475 // Set memory operation for length "less than" 64 bytes.
9476 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
9477 XMMRegister xmm, KRegister mask, Register length,
9478 Register temp, bool use64byteVector) {
9479 assert(MaxVectorSize >= 32, "vector length should be >= 32");
9480 const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9481 if (!use64byteVector) {
9482 fill32(dst, disp, xmm);
9483 subptr(length, 32 >> shift);
9484 fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
9485 } else {
9486 assert(MaxVectorSize == 64, "vector length != 64");
9487 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
9488 }
9489 }
9490
9491
9492 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
9493 XMMRegister xmm, KRegister mask, Register length,
9494 Register temp) {
9495 assert(MaxVectorSize >= 32, "vector length should be >= 32");
9496 const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9497 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
9498 }
9499
9500
// Emits an unmasked vmovdqu store of xmm to dst.
// Asserts that MaxVectorSize is at least 32.
void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  vmovdqu(dst, xmm);
}
9505
9506 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
9507 fill32(Address(dst, disp), xmm);
9508 }
9509
9510 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
9511 assert(MaxVectorSize >= 32, "vector length should be >= 32");
9512 if (!use64byteVector) {
9513 fill32(dst, xmm);
9514 fill32(dst.plus_disp(32), xmm);
9515 } else {
9516 evmovdquq(dst, xmm, Assembler::AVX_512bit);
9517 }
9518 }
9519
9520 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
9521 fill64(Address(dst, disp), xmm, use64byteVector);
9522 }
9523
9524 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
9525 Register count, Register rtmp, XMMRegister xtmp) {
9526 Label L_exit;
9527 Label L_fill_start;
9528 Label L_fill_64_bytes;
9529 Label L_fill_96_bytes;
9530 Label L_fill_128_bytes;
9531 Label L_fill_128_bytes_loop;
9532 Label L_fill_128_loop_header;
9533 Label L_fill_128_bytes_loop_header;
9534 Label L_fill_128_bytes_loop_pre_header;
9535 Label L_fill_zmm_sequence;
9536
9537 int shift = -1;
9538 switch(type) {
9539 case T_BYTE: shift = 0;
9540 break;
9541 case T_SHORT: shift = 1;
9542 break;
9543 case T_INT: shift = 2;
9544 break;
9545 /* Uncomment when LONG fill stubs are supported.
9546 case T_LONG: shift = 3;
9547 break;
9548 */
9549 default:
9550 fatal("Unhandled type: %s\n", type2name(type));
9551 }
9552
9553 if ((CopyAVX3Threshold != 0) || (MaxVectorSize == 32)) {
9554
9555 if (MaxVectorSize == 64) {
9556 cmpq(count, CopyAVX3Threshold >> shift);
9557 jcc(Assembler::greater, L_fill_zmm_sequence);
9558 }
9559
9560 evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
9561
9562 bind(L_fill_start);
9563
9564 cmpq(count, 32 >> shift);
9565 jccb(Assembler::greater, L_fill_64_bytes);
9566 fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
9567 jmp(L_exit);
9568
9569 bind(L_fill_64_bytes);
9570 cmpq(count, 64 >> shift);
9571 jccb(Assembler::greater, L_fill_96_bytes);
9572 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
9573 jmp(L_exit);
9574
9575 bind(L_fill_96_bytes);
9576 cmpq(count, 96 >> shift);
9577 jccb(Assembler::greater, L_fill_128_bytes);
9578 fill64(to, 0, xtmp);
9579 subq(count, 64 >> shift);
9580 fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
9581 jmp(L_exit);
9582
9583 bind(L_fill_128_bytes);
9584 cmpq(count, 128 >> shift);
9585 jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
9586 fill64(to, 0, xtmp);
9587 fill32(to, 64, xtmp);
9588 subq(count, 96 >> shift);
9589 fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
9590 jmp(L_exit);
9591
9592 bind(L_fill_128_bytes_loop_pre_header);
9593 {
9594 mov(rtmp, to);
9595 andq(rtmp, 31);
9596 jccb(Assembler::zero, L_fill_128_bytes_loop_header);
9597 negq(rtmp);
9598 addq(rtmp, 32);
9599 mov64(r8, -1L);
9600 bzhiq(r8, r8, rtmp);
9601 kmovql(k2, r8);
9602 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit);
9603 addq(to, rtmp);
9604 shrq(rtmp, shift);
9605 subq(count, rtmp);
9606 }
9607
9608 cmpq(count, 128 >> shift);
9609 jcc(Assembler::less, L_fill_start);
9610
9611 bind(L_fill_128_bytes_loop_header);
9612 subq(count, 128 >> shift);
9613
9614 align32();
9615 bind(L_fill_128_bytes_loop);
9616 fill64(to, 0, xtmp);
9617 fill64(to, 64, xtmp);
9618 addq(to, 128);
9619 subq(count, 128 >> shift);
9620 jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
9621
9622 addq(count, 128 >> shift);
9623 jcc(Assembler::zero, L_exit);
9624 jmp(L_fill_start);
9625 }
9626
9627 if (MaxVectorSize == 64) {
9628 // Sequence using 64 byte ZMM register.
9629 Label L_fill_128_bytes_zmm;
9630 Label L_fill_192_bytes_zmm;
9631 Label L_fill_192_bytes_loop_zmm;
9632 Label L_fill_192_bytes_loop_header_zmm;
9633 Label L_fill_192_bytes_loop_pre_header_zmm;
9634 Label L_fill_start_zmm_sequence;
9635
9636 bind(L_fill_zmm_sequence);
9637 evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
9638
9639 bind(L_fill_start_zmm_sequence);
9640 cmpq(count, 64 >> shift);
9641 jccb(Assembler::greater, L_fill_128_bytes_zmm);
9642 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
9643 jmp(L_exit);
9644
9645 bind(L_fill_128_bytes_zmm);
9646 cmpq(count, 128 >> shift);
9647 jccb(Assembler::greater, L_fill_192_bytes_zmm);
9648 fill64(to, 0, xtmp, true);
9649 subq(count, 64 >> shift);
9650 fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
9651 jmp(L_exit);
9652
9653 bind(L_fill_192_bytes_zmm);
9654 cmpq(count, 192 >> shift);
9655 jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
9656 fill64(to, 0, xtmp, true);
9657 fill64(to, 64, xtmp, true);
9658 subq(count, 128 >> shift);
9659 fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
9660 jmp(L_exit);
9661
9662 bind(L_fill_192_bytes_loop_pre_header_zmm);
9663 {
9664 movq(rtmp, to);
9665 andq(rtmp, 63);
9666 jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
9667 negq(rtmp);
9668 addq(rtmp, 64);
9669 mov64(r8, -1L);
9670 bzhiq(r8, r8, rtmp);
9671 kmovql(k2, r8);
9672 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit);
9673 addq(to, rtmp);
9674 shrq(rtmp, shift);
9675 subq(count, rtmp);
9676 }
9677
9678 cmpq(count, 192 >> shift);
9679 jcc(Assembler::less, L_fill_start_zmm_sequence);
9680
9681 bind(L_fill_192_bytes_loop_header_zmm);
9682 subq(count, 192 >> shift);
9683
9684 align32();
9685 bind(L_fill_192_bytes_loop_zmm);
9686 fill64(to, 0, xtmp, true);
9687 fill64(to, 64, xtmp, true);
9688 fill64(to, 128, xtmp, true);
9689 addq(to, 192);
9690 subq(count, 192 >> shift);
9691 jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
9692
9693 addq(count, 192 >> shift);
9694 jcc(Assembler::zero, L_exit);
9695 jmp(L_fill_start_zmm_sequence);
9696 }
9697 bind(L_exit);
9698 }
9699 #endif //COMPILER2
9700
9701
9702 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
9703 Label done;
9704 cvttss2sil(dst, src);
9705 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9706 cmpl(dst, 0x80000000); // float_sign_flip
9707 jccb(Assembler::notEqual, done);
9708 subptr(rsp, 8);
9709 movflt(Address(rsp, 0), src);
9710 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
9711 pop(dst);
9712 bind(done);
9713 }
9714
9715 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
9716 Label done;
9717 cvttsd2sil(dst, src);
9718 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9719 cmpl(dst, 0x80000000); // float_sign_flip
9720 jccb(Assembler::notEqual, done);
9721 subptr(rsp, 8);
9722 movdbl(Address(rsp, 0), src);
9723 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
9724 pop(dst);
9725 bind(done);
9726 }
9727
9728 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
9729 Label done;
9730 cvttss2siq(dst, src);
9731 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9732 jccb(Assembler::notEqual, done);
9733 subptr(rsp, 8);
9734 movflt(Address(rsp, 0), src);
9735 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
9736 pop(dst);
9737 bind(done);
9738 }
9739
// Emits code that rounds the float in src to an int in dst.
//   dst   result register
//   src   input value
//   rtmp  scratch register
//   rcx   receives the shift amount computed from the exponent.
//         NOTE(review): this parameter shadows the global rcx register name; the
//         count-less sarl(dst) below presumably shifts by cl, so callers are
//         expected to pass the real rcx here — confirm against callers.
void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  // Following code is line by line assembly translation rounding algorithm.
  // Please refer to java.lang.Math.round(float) algorithm for details.
  const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
  const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
  const int32_t FloatConsts_EXP_BIAS = 127;
  const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
  const int32_t MINUS_32 = 0xFFFFFFE0;
  Label L_special_case, L_block1, L_exit;
  // dst = biased exponent field of src.
  movl(rtmp, FloatConsts_EXP_BIT_MASK);
  movdl(dst, src);
  andl(dst, rtmp);
  sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
  // rtmp = shift = (SIGNIFICAND_WIDTH - 2 + EXP_BIAS) - exponent; keep a copy in rcx.
  movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
  subl(rtmp, dst);
  movl(rcx, rtmp);
  // A shift outside [0, 31] has bits set under MINUS_32: take the special case.
  movl(dst, MINUS_32);
  testl(rtmp, dst);
  jccb(Assembler::notEqual, L_special_case);
  // dst = significand bits with the implicit leading one added.
  movdl(dst, src);
  andl(dst, FloatConsts_SIGNIF_BIT_MASK);
  orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
  // Negate when the raw bits of src are negative (sign bit set).
  movdl(rtmp, src);
  testl(rtmp, rtmp);
  jccb(Assembler::greaterEqual, L_block1);
  negl(dst);
  bind(L_block1);
  // Arithmetic shift by the computed amount, then add one and shift right once more.
  sarl(dst);
  addl(dst, 0x1);
  sarl(dst, 0x1);
  jmp(L_exit);
  bind(L_special_case);
  // Fall back to the plain float -> int conversion.
  convert_f2i(dst, src);
  bind(L_exit);
}
9775
// Emits code that rounds the double in src to a long in dst.
//   dst   result register
//   src   input value
//   rtmp  scratch register
//   rcx   receives the shift amount computed from the exponent.
//         NOTE(review): this parameter shadows the global rcx register name; the
//         count-less sarq(dst) below presumably shifts by cl, so callers are
//         expected to pass the real rcx here — confirm against callers.
void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  // Following code is line by line assembly translation rounding algorithm.
  // Please refer to java.lang.Math.round(double) algorithm for details.
  const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
  const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
  const int64_t DoubleConsts_EXP_BIAS = 1023;
  const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
  const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
  Label L_special_case, L_block1, L_exit;
  // dst = biased exponent field of src.
  mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
  movq(dst, src);
  andq(dst, rtmp);
  sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
  // rtmp = shift = (SIGNIFICAND_WIDTH - 2 + EXP_BIAS) - exponent; keep a copy in rcx.
  mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
  subq(rtmp, dst);
  movq(rcx, rtmp);
  // A shift outside [0, 63] has bits set under MINUS_64: take the special case.
  mov64(dst, MINUS_64);
  testq(rtmp, dst);
  jccb(Assembler::notEqual, L_special_case);
  // dst = significand bits with the implicit leading one added.
  movq(dst, src);
  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
  andq(dst, rtmp);
  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
  orq(dst, rtmp);
  // Negate when the raw bits of src are negative (sign bit set).
  movq(rtmp, src);
  testq(rtmp, rtmp);
  jccb(Assembler::greaterEqual, L_block1);
  negq(dst);
  bind(L_block1);
  // Arithmetic shift by the computed amount, then add one and shift right once more.
  sarq(dst);
  addq(dst, 0x1);
  sarq(dst, 0x1);
  jmp(L_exit);
  bind(L_special_case);
  // Fall back to the plain double -> long conversion.
  convert_d2l(dst, src);
  bind(L_exit);
}
9813
9814 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
9815 Label done;
9816 cvttsd2siq(dst, src);
9817 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9818 jccb(Assembler::notEqual, done);
9819 subptr(rsp, 8);
9820 movdbl(Address(rsp, 0), src);
9821 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
9822 pop(dst);
9823 bind(done);
9824 }
9825
9826 void MacroAssembler::cache_wb(Address line)
9827 {
9828 // 64 bit cpus always support clflush
9829 bool optimized = VM_Version::supports_clflushopt();
9830 bool no_evict = VM_Version::supports_clwb();
9831
9832 // prefer clwb (writeback without evict) otherwise
9833 // prefer clflushopt (potentially parallel writeback with evict)
9834 // otherwise fallback on clflush (serial writeback with evict)
9835
9836 if (optimized) {
9837 if (no_evict) {
9838 clwb(line);
9839 } else {
9840 clflushopt(line);
9841 }
9842 } else {
9843 // no need for fence when using CLFLUSH
9844 clflush(line);
9845 }
9846 }
9847
9848 void MacroAssembler::cache_wbsync(bool is_pre)
9849 {
9850 bool optimized = VM_Version::supports_clflushopt();
9851 bool no_evict = VM_Version::supports_clwb();
9852
9853 // pick the correct implementation
9854
9855 if (!is_pre && (optimized || no_evict)) {
9856 // need an sfence for post flush when using clflushopt or clwb
9857 // otherwise no no need for any synchroniaztion
9858
9859 sfence();
9860 }
9861 }
9862
9863 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9864 switch (cond) {
9865 // Note some conditions are synonyms for others
9866 case Assembler::zero: return Assembler::notZero;
9867 case Assembler::notZero: return Assembler::zero;
9868 case Assembler::less: return Assembler::greaterEqual;
9869 case Assembler::lessEqual: return Assembler::greater;
9870 case Assembler::greater: return Assembler::lessEqual;
9871 case Assembler::greaterEqual: return Assembler::less;
9872 case Assembler::below: return Assembler::aboveEqual;
9873 case Assembler::belowEqual: return Assembler::above;
9874 case Assembler::above: return Assembler::belowEqual;
9875 case Assembler::aboveEqual: return Assembler::below;
9876 case Assembler::overflow: return Assembler::noOverflow;
9877 case Assembler::noOverflow: return Assembler::overflow;
9878 case Assembler::negative: return Assembler::positive;
9879 case Assembler::positive: return Assembler::negative;
9880 case Assembler::parity: return Assembler::noParity;
9881 case Assembler::noParity: return Assembler::parity;
9882 }
9883 ShouldNotReachHere(); return Assembler::overflow;
9884 }
9885
// This is simply a call to Thread::current()
//
// Loads the result of Thread::current() into `thread`. rdi, rsi, rdx, rcx,
// r8, r9, r10 and r11 are saved around the call and restored afterwards; rax
// is saved and restored too unless it is the destination register.
void MacroAssembler::get_thread_slow(Register thread) {
  // rax carries the call result, so it only needs preserving when it is not
  // the requested destination.
  if (thread != rax) {
    push(rax);
  }
  push(rdi);
  push(rsi);
  push(rdx);
  push(rcx);
  push(r8);
  push(r9);
  push(r10);
  push(r11);

  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);

  // Restore in reverse push order.
  pop(r11);
  pop(r10);
  pop(r9);
  pop(r8);
  pop(rcx);
  pop(rdx);
  pop(rsi);
  pop(rdi);
  if (thread != rax) {
    // Move the result out of rax before restoring rax's old value.
    mov(thread, rax);
    pop(rax);
  }
}
9915
9916 void MacroAssembler::check_stack_alignment(Register sp, const char* msg, unsigned bias, Register tmp) {
9917 Label L_stack_ok;
9918 if (bias == 0) {
9919 testptr(sp, 2 * wordSize - 1);
9920 } else {
9921 // lea(tmp, Address(rsp, bias);
9922 mov(tmp, sp);
9923 addptr(tmp, bias);
9924 testptr(tmp, 2 * wordSize - 1);
9925 }
9926 jcc(Assembler::equal, L_stack_ok);
9927 block_comment(msg);
9928 stop(msg);
9929 bind(L_stack_ok);
9930 }
9931
// Implements fast-locking.
//
//  - basic_lock: the BasicObjectLock for this lock operation; its BasicLock
//                monitor cache slot is cleared when UseObjectMonitorTable is set
//  - obj: the object to be locked
//  - reg_rax: rax
//  - tmp: a temporary register
//  - slow: label branched to when the fast path cannot complete
//
// The current thread is taken from r15_thread; it is not a parameter.
// On the fast path the emitted code falls through with obj pushed on the
// thread's lock-stack.
void MacroAssembler::fast_lock(Register basic_lock, Register obj, Register reg_rax, Register tmp, Label& slow) {
  Register thread = r15_thread;

  assert(reg_rax == rax, "");
  assert_different_registers(basic_lock, obj, reg_rax, thread, tmp);

  Label push;
  // `top` aliases `tmp`: it holds the lock-stack top offset except while tmp
  // is borrowed for the klass check and the CAS below.
  const Register top = tmp;

  // Preload the markWord. It is important that this is the first
  // instruction emitted as it is part of C1's null check semantics.
  movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Objects of value-based classes always take the slow path.
    load_klass(tmp, obj, rscratch1);
    testb(Address(tmp, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow);
  }

  // Load top.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

  // Check if the lock-stack is full.
  cmpl(top, LockStack::end_offset());
  jcc(Assembler::greaterEqual, slow);

  // Check for recursion.
  // (obj equal to the entry just below top: skip the CAS and only push again.)
  cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
  jcc(Assembler::equal, push);

  // Check header for monitor (0b10).
  testptr(reg_rax, markWord::monitor_value);
  jcc(Assembler::notZero, slow);

  // Try to lock. Transition lock bits 0b01 => 0b00
  // tmp = mark with the unlocked bit cleared (new value);
  // rax = mark with the unlocked bit set (expected value).
  movptr(tmp, reg_rax);
  andptr(tmp, ~(int32_t)markWord::unlocked_value);
  orptr(reg_rax, markWord::unlocked_value);
  lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
  jcc(Assembler::notEqual, slow);

  // Restore top, CAS clobbers register.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

  bind(push);
  // After successful lock, push object on lock-stack.
  movptr(Address(thread, top), obj);
  incrementl(top, oopSize);
  movl(Address(thread, JavaThread::lock_stack_top_offset()), top);
}
9993
// Implements fast-unlocking.
//
//  - obj: the object to be unlocked
//  - reg_rax: rax
//  - tmp: a temporary register
//  - slow: label branched to when the fast path cannot complete
//
// The current thread is taken from r15_thread; it is not a parameter.
// On the fast path the emitted code falls through with obj popped from the
// thread's lock-stack.
void MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register tmp, Label& slow) {
  Register thread = r15_thread;

  assert(reg_rax == rax, "");
  assert_different_registers(obj, reg_rax, thread, tmp);

  Label unlocked, push_and_slow;
  const Register top = tmp;

  // Check if obj is top of lock-stack.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
  cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
  jcc(Assembler::notEqual, slow);

  // Pop lock-stack.
  // Debug builds also zero the popped slot. `top` keeps the pre-pop offset.
  DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
  subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

  // Check if recursive.
  // (the entry below the popped one is the same object: nothing more to do.)
  cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
  jcc(Assembler::equal, unlocked);

  // Not recursive. Check header for monitor (0b10).
  movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
  testptr(reg_rax, markWord::monitor_value);
  jcc(Assembler::notZero, push_and_slow);

#ifdef ASSERT
  // Check header not unlocked (0b01).
  Label not_unlocked;
  testptr(reg_rax, markWord::unlocked_value);
  jcc(Assembler::zero, not_unlocked);
  stop("fast_unlock already unlocked");
  bind(not_unlocked);
#endif

  // Try to unlock. Transition lock bits 0b00 => 0b01
  // rax = current mark (expected value); tmp = mark with the unlocked bit set (new value).
  movptr(tmp, reg_rax);
  orptr(tmp, markWord::unlocked_value);
  lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
  jcc(Assembler::equal, unlocked);

  bind(push_and_slow);
  // Restore lock-stack and handle the unlock in runtime.
#ifdef ASSERT
  // The popped slot was zeroed above in debug builds, so write obj back.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
  movptr(Address(thread, top), obj);
#endif
  addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
  jmp(slow);

  bind(unlocked);
}
10053
// Saves legacy GPRs state on stack.
//
// Reserves 16 words below rsp and stores rax, rcx, rdx, rbx, rbp, rsi, rdi and
// r8-r15 into fixed slots (rax at the highest slot, r15 at offset 0). Slot 11
// is not written; presumably it is the position rsp would occupy — confirm.
// The layout must stay in sync with restore_legacy_gprs().
void MacroAssembler::save_legacy_gprs() {
  subq(rsp, 16 * wordSize);
  movq(Address(rsp, 15 * wordSize), rax);
  movq(Address(rsp, 14 * wordSize), rcx);
  movq(Address(rsp, 13 * wordSize), rdx);
  movq(Address(rsp, 12 * wordSize), rbx);
  // slot 11 intentionally skipped
  movq(Address(rsp, 10 * wordSize), rbp);
  movq(Address(rsp, 9 * wordSize), rsi);
  movq(Address(rsp, 8 * wordSize), rdi);
  movq(Address(rsp, 7 * wordSize), r8);
  movq(Address(rsp, 6 * wordSize), r9);
  movq(Address(rsp, 5 * wordSize), r10);
  movq(Address(rsp, 4 * wordSize), r11);
  movq(Address(rsp, 3 * wordSize), r12);
  movq(Address(rsp, 2 * wordSize), r13);
  movq(Address(rsp, wordSize), r14);
  movq(Address(rsp, 0), r15);
}
10073
// Restores legacy GPRs state from stack.
//
// Reloads r15-r8, rdi, rsi, rbp, rbx, rdx, rcx and rax from the slots written
// by save_legacy_gprs() and then releases the 16-word area. Slot 11 is not
// read, matching the save side.
void MacroAssembler::restore_legacy_gprs() {
  movq(r15, Address(rsp, 0));
  movq(r14, Address(rsp, wordSize));
  movq(r13, Address(rsp, 2 * wordSize));
  movq(r12, Address(rsp, 3 * wordSize));
  movq(r11, Address(rsp, 4 * wordSize));
  movq(r10, Address(rsp, 5 * wordSize));
  movq(r9,  Address(rsp, 6 * wordSize));
  movq(r8,  Address(rsp, 7 * wordSize));
  movq(rdi, Address(rsp, 8 * wordSize));
  movq(rsi, Address(rsp, 9 * wordSize));
  movq(rbp, Address(rsp, 10 * wordSize));
  // slot 11 intentionally skipped
  movq(rbx, Address(rsp, 12 * wordSize));
  movq(rdx, Address(rsp, 13 * wordSize));
  movq(rcx, Address(rsp, 14 * wordSize));
  movq(rax, Address(rsp, 15 * wordSize));
  addq(rsp, 16 * wordSize);
}
10093
10094 void MacroAssembler::load_aotrc_address(Register reg, address a) {
10095 #if INCLUDE_CDS
10096 assert(AOTRuntimeConstants::contains(a), "address out of range for data area");
10097 if (AOTCodeCache::is_on_for_dump()) {
10098 // all aotrc field addresses should be registered in the AOTCodeCache address table
10099 lea(reg, ExternalAddress(a));
10100 } else {
10101 mov64(reg, (uint64_t)a);
10102 }
10103 #else
10104 ShouldNotReachHere();
10105 #endif
10106 }
10107
10108 void MacroAssembler::setcc(Assembler::Condition comparison, Register dst) {
10109 if (VM_Version::supports_apx_f()) {
10110 esetzucc(comparison, dst);
10111 } else {
10112 setb(comparison, dst);
10113 movzbl(dst, dst);
10114 }
10115 }