1 /*
2 * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "code/aotCodeCache.hpp"
28 #include "code/compiledIC.hpp"
29 #include "compiler/compiler_globals.hpp"
30 #include "compiler/disassembler.hpp"
31 #include "crc32c.h"
32 #include "gc/shared/barrierSet.hpp"
33 #include "gc/shared/barrierSetAssembler.hpp"
34 #include "gc/shared/collectedHeap.inline.hpp"
35 #include "gc/shared/tlab_globals.hpp"
36 #include "interpreter/bytecodeHistogram.hpp"
37 #include "interpreter/interpreter.hpp"
38 #include "interpreter/interpreterRuntime.hpp"
39 #include "jvm.h"
40 #include "memory/resourceArea.hpp"
41 #include "memory/universe.hpp"
42 #include "oops/accessDecorators.hpp"
43 #include "oops/compressedKlass.inline.hpp"
44 #include "oops/compressedOops.inline.hpp"
45 #include "oops/klass.inline.hpp"
46 #include "prims/methodHandles.hpp"
47 #include "runtime/continuation.hpp"
48 #include "runtime/interfaceSupport.inline.hpp"
49 #include "runtime/javaThread.hpp"
50 #include "runtime/jniHandles.hpp"
51 #include "runtime/objectMonitor.hpp"
52 #include "runtime/os.hpp"
53 #include "runtime/safepoint.hpp"
54 #include "runtime/safepointMechanism.hpp"
55 #include "runtime/sharedRuntime.hpp"
56 #include "runtime/stubRoutines.hpp"
57 #include "utilities/checkedCast.hpp"
58 #include "utilities/globalDefinitions.hpp"
59 #include "utilities/macros.hpp"
60
// Listing helpers used while emitting code.
//  - BLOCK_COMMENT(str): expands to block_comment(str) in non-product builds and
//    to nothing in product builds.
//  - STOP(error): always emits stop(error); non-product builds first record the
//    message as a block comment.
//  - BIND(label): binds the label and annotates it with "<label>:".
// NOTE: STOP (non-product) and BIND expand to more than one statement, so they
// must not be used as the sole body of an unbraced if/else/loop.
#ifndef PRODUCT
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#else
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
70
#ifdef ASSERT
// Platform-dependent instruction-mark check (debug builds only).
// This implementation unconditionally reports success.
bool AbstractAssembler::pd_check_instruction_mark() {
  return true;
}
#endif // ASSERT
74
75 static const Assembler::Condition reverse[] = {
76 Assembler::noOverflow /* overflow = 0x0 */ ,
77 Assembler::overflow /* noOverflow = 0x1 */ ,
78 Assembler::aboveEqual /* carrySet = 0x2, below = 0x2 */ ,
79 Assembler::below /* aboveEqual = 0x3, carryClear = 0x3 */ ,
80 Assembler::notZero /* zero = 0x4, equal = 0x4 */ ,
81 Assembler::zero /* notZero = 0x5, notEqual = 0x5 */ ,
82 Assembler::above /* belowEqual = 0x6 */ ,
83 Assembler::belowEqual /* above = 0x7 */ ,
84 Assembler::positive /* negative = 0x8 */ ,
85 Assembler::negative /* positive = 0x9 */ ,
86 Assembler::noParity /* parity = 0xa */ ,
87 Assembler::parity /* noParity = 0xb */ ,
88 Assembler::greaterEqual /* less = 0xc */ ,
89 Assembler::less /* greaterEqual = 0xd */ ,
90 Assembler::greater /* lessEqual = 0xe */ ,
91 Assembler::lessEqual /* greater = 0xf, */
92
93 };
94
95
96 // Implementation of MacroAssembler
97
98 Address MacroAssembler::as_Address(AddressLiteral adr) {
99 // amd64 always does this as a pc-rel
100 // we can be absolute or disp based on the instruction type
101 // jmp/call are displacements others are absolute
102 assert(!adr.is_lval(), "must be rval");
103 assert(reachable(adr), "must be");
104 return Address(checked_cast<int32_t>(adr.target() - pc()), adr.target(), adr.reloc());
105
106 }
107
108 Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
109 AddressLiteral base = adr.base();
110 lea(rscratch, base);
111 Address index = adr.index();
112 assert(index._disp == 0, "must not have disp"); // maybe it can?
113 Address array(rscratch, index._index, index._scale, index._disp);
114 return array;
115 }
116
117 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
118 Label L, E;
119
120 #ifdef _WIN64
121 // Windows always allocates space for it's register args
122 assert(num_args <= 4, "only register arguments supported");
123 subq(rsp, frame::arg_reg_save_area_bytes);
124 #endif
125
126 // Align stack if necessary
127 testl(rsp, 15);
128 jcc(Assembler::zero, L);
129
130 subq(rsp, 8);
131 call(RuntimeAddress(entry_point));
132 addq(rsp, 8);
133 jmp(E);
134
135 bind(L);
136 call(RuntimeAddress(entry_point));
137
138 bind(E);
139
140 #ifdef _WIN64
141 // restore stack pointer
142 addq(rsp, frame::arg_reg_save_area_bytes);
143 #endif
144 }
145
146 void MacroAssembler::cmp64(Register src1, AddressLiteral src2, Register rscratch) {
147 assert(!src2.is_lval(), "should use cmpptr");
148 assert(rscratch != noreg || always_reachable(src2), "missing");
149
150 if (reachable(src2)) {
151 cmpq(src1, as_Address(src2));
152 } else {
153 lea(rscratch, src2);
154 Assembler::cmpq(src1, Address(rscratch, 0));
155 }
156 }
157
158 int MacroAssembler::corrected_idivq(Register reg) {
159 // Full implementation of Java ldiv and lrem; checks for special
160 // case as described in JVM spec., p.243 & p.271. The function
161 // returns the (pc) offset of the idivl instruction - may be needed
162 // for implicit exceptions.
163 //
164 // normal case special case
165 //
166 // input : rax: dividend min_long
167 // reg: divisor (may not be eax/edx) -1
168 //
169 // output: rax: quotient (= rax idiv reg) min_long
170 // rdx: remainder (= rax irem reg) 0
171 assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
172 static const int64_t min_long = 0x8000000000000000;
173 Label normal_case, special_case;
174
175 // check for special case
176 cmp64(rax, ExternalAddress((address) &min_long), rdx /*rscratch*/);
177 jcc(Assembler::notEqual, normal_case);
178 xorl(rdx, rdx); // prepare rdx for possible special case (where
179 // remainder = 0)
180 cmpq(reg, -1);
181 jcc(Assembler::equal, special_case);
182
183 // handle normal case
184 bind(normal_case);
185 cdqq();
186 int idivq_offset = offset();
187 idivq(reg);
188
189 // normal and special case exit
190 bind(special_case);
191
192 return idivq_offset;
193 }
194
195 void MacroAssembler::decrementq(Register reg, int value) {
196 if (value == min_jint) { subq(reg, value); return; }
197 if (value < 0) { incrementq(reg, -value); return; }
198 if (value == 0) { ; return; }
199 if (value == 1 && UseIncDec) { decq(reg) ; return; }
200 /* else */ { subq(reg, value) ; return; }
201 }
202
203 void MacroAssembler::decrementq(Address dst, int value) {
204 if (value == min_jint) { subq(dst, value); return; }
205 if (value < 0) { incrementq(dst, -value); return; }
206 if (value == 0) { ; return; }
207 if (value == 1 && UseIncDec) { decq(dst) ; return; }
208 /* else */ { subq(dst, value) ; return; }
209 }
210
211 void MacroAssembler::incrementq(AddressLiteral dst, Register rscratch) {
212 assert(rscratch != noreg || always_reachable(dst), "missing");
213
214 if (reachable(dst)) {
215 incrementq(as_Address(dst));
216 } else {
217 lea(rscratch, dst);
218 incrementq(Address(rscratch, 0));
219 }
220 }
221
222 void MacroAssembler::incrementq(Register reg, int value) {
223 if (value == min_jint) { addq(reg, value); return; }
224 if (value < 0) { decrementq(reg, -value); return; }
225 if (value == 0) { ; return; }
226 if (value == 1 && UseIncDec) { incq(reg) ; return; }
227 /* else */ { addq(reg, value) ; return; }
228 }
229
230 void MacroAssembler::incrementq(Address dst, int value) {
231 if (value == min_jint) { addq(dst, value); return; }
232 if (value < 0) { decrementq(dst, -value); return; }
233 if (value == 0) { ; return; }
234 if (value == 1 && UseIncDec) { incq(dst) ; return; }
235 /* else */ { addq(dst, value) ; return; }
236 }
237
238 // 32bit can do a case table jump in one instruction but we no longer allow the base
239 // to be installed in the Address class
240 void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
241 lea(rscratch, entry.base());
242 Address dispatch = entry.index();
243 assert(dispatch._base == noreg, "must be");
244 dispatch._base = rscratch;
245 jmp(dispatch);
246 }
247
248 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
249 ShouldNotReachHere(); // 64bit doesn't use two regs
250 cmpq(x_lo, y_lo);
251 }
252
253 void MacroAssembler::lea(Register dst, AddressLiteral src) {
254 mov_literal64(dst, (intptr_t)src.target(), src.rspec());
255 }
256
257 void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
258 lea(rscratch, adr);
259 movptr(dst, rscratch);
260 }
261
262 void MacroAssembler::leave() {
263 // %%% is this really better? Why not on 32bit too?
264 emit_int8((unsigned char)0xC9); // LEAVE
265 }
266
267 void MacroAssembler::lneg(Register hi, Register lo) {
268 ShouldNotReachHere(); // 64bit doesn't use two regs
269 negq(lo);
270 }
271
272 void MacroAssembler::movoop(Register dst, jobject obj) {
273 mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
274 }
275
276 void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
277 mov_literal64(rscratch, (intptr_t)obj, oop_Relocation::spec_for_immediate());
278 movq(dst, rscratch);
279 }
280
281 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
282 mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
283 }
284
285 void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
286 mov_literal64(rscratch, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
287 movq(dst, rscratch);
288 }
289
290 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
291 if (src.is_lval()) {
292 mov_literal64(dst, (intptr_t)src.target(), src.rspec());
293 } else {
294 if (reachable(src)) {
295 movq(dst, as_Address(src));
296 } else {
297 lea(dst, src);
298 movq(dst, Address(dst, 0));
299 }
300 }
301 }
302
303 void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
304 movq(as_Address(dst, rscratch), src);
305 }
306
307 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
308 movq(dst, as_Address(src, dst /*rscratch*/));
309 }
310
311 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
312 void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
313 if (is_simm32(src)) {
314 movptr(dst, checked_cast<int32_t>(src));
315 } else {
316 mov64(rscratch, src);
317 movq(dst, rscratch);
318 }
319 }
320
321 void MacroAssembler::pushoop(jobject obj, Register rscratch) {
322 movoop(rscratch, obj);
323 push(rscratch);
324 }
325
326 void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
327 mov_metadata(rscratch, obj);
328 push(rscratch);
329 }
330
331 void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
332 lea(rscratch, src);
333 if (src.is_lval()) {
334 push(rscratch);
335 } else {
336 pushq(Address(rscratch, 0));
337 }
338 }
339
340 static void pass_arg0(MacroAssembler* masm, Register arg) {
341 if (c_rarg0 != arg ) {
342 masm->mov(c_rarg0, arg);
343 }
344 }
345
346 static void pass_arg1(MacroAssembler* masm, Register arg) {
347 if (c_rarg1 != arg ) {
348 masm->mov(c_rarg1, arg);
349 }
350 }
351
352 static void pass_arg2(MacroAssembler* masm, Register arg) {
353 if (c_rarg2 != arg ) {
354 masm->mov(c_rarg2, arg);
355 }
356 }
357
358 static void pass_arg3(MacroAssembler* masm, Register arg) {
359 if (c_rarg3 != arg ) {
360 masm->mov(c_rarg3, arg);
361 }
362 }
363
364 void MacroAssembler::stop(const char* msg) {
365 if (ShowMessageBoxOnError) {
366 address rip = pc();
367 pusha(); // get regs on stack
368 lea(c_rarg1, InternalAddress(rip));
369 movq(c_rarg2, rsp); // pass pointer to regs array
370 }
371 // Skip AOT caching C strings in scratch buffer.
372 const char* str = (code_section()->scratch_emit()) ? msg : AOTCodeCache::add_C_string(msg);
373 lea(c_rarg0, ExternalAddress((address) str));
374 andq(rsp, -16); // align stack as required by ABI
375 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
376 hlt();
377 }
378
379 void MacroAssembler::warn(const char* msg) {
380 push(rbp);
381 movq(rbp, rsp);
382 andq(rsp, -16); // align stack as required by push_CPU_state and call
383 push_CPU_state(); // keeps alignment at 16 bytes
384
385 #ifdef _WIN64
386 // Windows always allocates space for its register args
387 subq(rsp, frame::arg_reg_save_area_bytes);
388 #endif
389 const char* str = (code_section()->scratch_emit()) ? msg : AOTCodeCache::add_C_string(msg);
390 lea(c_rarg0, ExternalAddress((address) str));
391 call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
392
393 #ifdef _WIN64
394 // restore stack pointer
395 addq(rsp, frame::arg_reg_save_area_bytes);
396 #endif
397 pop_CPU_state();
398 mov(rsp, rbp);
399 pop(rbp);
400 }
401
402 void MacroAssembler::print_state() {
403 address rip = pc();
404 pusha(); // get regs on stack
405 push(rbp);
406 movq(rbp, rsp);
407 andq(rsp, -16); // align stack as required by push_CPU_state and call
408 push_CPU_state(); // keeps alignment at 16 bytes
409
410 lea(c_rarg0, InternalAddress(rip));
411 lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
412 call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
413
414 pop_CPU_state();
415 mov(rsp, rbp);
416 pop(rbp);
417 popa();
418 }
419
420 #ifndef PRODUCT
421 extern "C" void findpc(intptr_t x);
422 #endif
423
424 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
425 // In order to get locks to work, we need to fake a in_VM state
426 if (ShowMessageBoxOnError) {
427 JavaThread* thread = JavaThread::current();
428 JavaThreadState saved_state = thread->thread_state();
429 thread->set_thread_state(_thread_in_vm);
430 #ifndef PRODUCT
431 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
432 ttyLocker ttyl;
433 BytecodeCounter::print();
434 }
435 #endif
436 // To see where a verify_oop failed, get $ebx+40/X for this frame.
437 // XXX correct this offset for amd64
438 // This is the value of eip which points to where verify_oop will return.
439 if (os::message_box(msg, "Execution stopped, print registers?")) {
440 print_state64(pc, regs);
441 BREAKPOINT;
442 }
443 }
444 fatal("DEBUG MESSAGE: %s", msg);
445 }
446
447 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
448 ttyLocker ttyl;
449 DebuggingContext debugging{};
450 tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
451 #ifndef PRODUCT
452 tty->cr();
453 findpc(pc);
454 tty->cr();
455 #endif
456 #define PRINT_REG(rax, value) \
457 { tty->print("%s = ", #rax); os::print_location(tty, value); }
458 PRINT_REG(rax, regs[15]);
459 PRINT_REG(rbx, regs[12]);
460 PRINT_REG(rcx, regs[14]);
461 PRINT_REG(rdx, regs[13]);
462 PRINT_REG(rdi, regs[8]);
463 PRINT_REG(rsi, regs[9]);
464 PRINT_REG(rbp, regs[10]);
465 // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
466 PRINT_REG(rsp, (intptr_t)(®s[16]));
467 PRINT_REG(r8 , regs[7]);
468 PRINT_REG(r9 , regs[6]);
469 PRINT_REG(r10, regs[5]);
470 PRINT_REG(r11, regs[4]);
471 PRINT_REG(r12, regs[3]);
472 PRINT_REG(r13, regs[2]);
473 PRINT_REG(r14, regs[1]);
474 PRINT_REG(r15, regs[0]);
475 #undef PRINT_REG
476 // Print some words near the top of the stack.
477 int64_t* rsp = ®s[16];
478 int64_t* dump_sp = rsp;
479 for (int col1 = 0; col1 < 8; col1++) {
480 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
481 os::print_location(tty, *dump_sp++);
482 }
483 for (int row = 0; row < 25; row++) {
484 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
485 for (int col = 0; col < 4; col++) {
486 tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
487 }
488 tty->cr();
489 }
490 // Print some instructions around pc:
491 Disassembler::decode((address)pc-64, (address)pc);
492 tty->print_cr("--------");
493 Disassembler::decode((address)pc, (address)pc+32);
494 }
495
496 // The java_calling_convention describes stack locations as ideal slots on
497 // a frame with no abi restrictions. Since we must observe abi restrictions
498 // (like the placement of the register window) the slots must be biased by
499 // the following value.
500 static int reg2offset_in(VMReg r) {
501 // Account for saved rbp and return address
502 // This should really be in_preserve_stack_slots
503 return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
504 }
505
506 static int reg2offset_out(VMReg r) {
507 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
508 }
509
510 // A long move
511 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
512
513 // The calling conventions assures us that each VMregpair is either
514 // all really one physical register or adjacent stack slots.
515
516 if (src.is_single_phys_reg() ) {
517 if (dst.is_single_phys_reg()) {
518 if (dst.first() != src.first()) {
519 mov(dst.first()->as_Register(), src.first()->as_Register());
520 }
521 } else {
522 assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
523 src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
524 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
525 }
526 } else if (dst.is_single_phys_reg()) {
527 assert(src.is_single_reg(), "not a stack pair");
528 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
529 } else {
530 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
531 movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
532 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
533 }
534 }
535
536 // A double move
537 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
538
539 // The calling conventions assures us that each VMregpair is either
540 // all really one physical register or adjacent stack slots.
541
542 if (src.is_single_phys_reg() ) {
543 if (dst.is_single_phys_reg()) {
544 // In theory these overlap but the ordering is such that this is likely a nop
545 if ( src.first() != dst.first()) {
546 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
547 }
548 } else {
549 assert(dst.is_single_reg(), "not a stack pair");
550 movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
551 }
552 } else if (dst.is_single_phys_reg()) {
553 assert(src.is_single_reg(), "not a stack pair");
554 movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
555 } else {
556 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
557 movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
558 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
559 }
560 }
561
562
563 // A float arg may have to do float reg int reg conversion
564 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
565 assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
566
567 // The calling conventions assures us that each VMregpair is either
568 // all really one physical register or adjacent stack slots.
569
570 if (src.first()->is_stack()) {
571 if (dst.first()->is_stack()) {
572 movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
573 movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
574 } else {
575 // stack to reg
576 assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
577 movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
578 }
579 } else if (dst.first()->is_stack()) {
580 // reg to stack
581 assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
582 movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
583 } else {
584 // reg to reg
585 // In theory these overlap but the ordering is such that this is likely a nop
586 if ( src.first() != dst.first()) {
587 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
588 }
589 }
590 }
591
592 // On 64 bit we will store integer like items to the stack as
593 // 64 bits items (x86_32/64 abi) even though java would only store
594 // 32bits for a parameter. On 32bit it will simply be 32 bits
595 // So this routine will do 32->32 on 32bit and 32->64 on 64bit
596 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
597 if (src.first()->is_stack()) {
598 if (dst.first()->is_stack()) {
599 // stack to stack
600 movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
601 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
602 } else {
603 // stack to reg
604 movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
605 }
606 } else if (dst.first()->is_stack()) {
607 // reg to stack
608 // Do we really have to sign extend???
609 // __ movslq(src.first()->as_Register(), src.first()->as_Register());
610 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
611 } else {
612 // Do we really have to sign extend???
613 // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
614 if (dst.first() != src.first()) {
615 movq(dst.first()->as_Register(), src.first()->as_Register());
616 }
617 }
618 }
619
620 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
621 if (src.first()->is_stack()) {
622 if (dst.first()->is_stack()) {
623 // stack to stack
624 movq(rax, Address(rbp, reg2offset_in(src.first())));
625 movq(Address(rsp, reg2offset_out(dst.first())), rax);
626 } else {
627 // stack to reg
628 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
629 }
630 } else if (dst.first()->is_stack()) {
631 // reg to stack
632 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
633 } else {
634 if (dst.first() != src.first()) {
635 movq(dst.first()->as_Register(), src.first()->as_Register());
636 }
637 }
638 }
639
640 // An oop arg. Must pass a handle not the oop itself
641 void MacroAssembler::object_move(OopMap* map,
642 int oop_handle_offset,
643 int framesize_in_slots,
644 VMRegPair src,
645 VMRegPair dst,
646 bool is_receiver,
647 int* receiver_offset) {
648
649 // must pass a handle. First figure out the location we use as a handle
650
651 Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
652
653 // See if oop is null if it is we need no handle
654
655 if (src.first()->is_stack()) {
656
657 // Oop is already on the stack as an argument
658 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
659 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
660 if (is_receiver) {
661 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
662 }
663
664 cmpptr(Address(rbp, reg2offset_in(src.first())), NULL_WORD);
665 lea(rHandle, Address(rbp, reg2offset_in(src.first())));
666 // conditionally move a null
667 cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
668 } else {
669
670 // Oop is in a register we must store it to the space we reserve
671 // on the stack for oop_handles and pass a handle if oop is non-null
672
673 const Register rOop = src.first()->as_Register();
674 int oop_slot;
675 if (rOop == j_rarg0)
676 oop_slot = 0;
677 else if (rOop == j_rarg1)
678 oop_slot = 1;
679 else if (rOop == j_rarg2)
680 oop_slot = 2;
681 else if (rOop == j_rarg3)
682 oop_slot = 3;
683 else if (rOop == j_rarg4)
684 oop_slot = 4;
685 else {
686 assert(rOop == j_rarg5, "wrong register");
687 oop_slot = 5;
688 }
689
690 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
691 int offset = oop_slot*VMRegImpl::stack_slot_size;
692
693 map->set_oop(VMRegImpl::stack2reg(oop_slot));
694 // Store oop in handle area, may be null
695 movptr(Address(rsp, offset), rOop);
696 if (is_receiver) {
697 *receiver_offset = offset;
698 }
699
700 cmpptr(rOop, NULL_WORD);
701 lea(rHandle, Address(rsp, offset));
702 // conditionally move a null from the handle area where it was just stored
703 cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
704 }
705
706 // If arg is on the stack then place it otherwise it is already in correct reg.
707 if (dst.first()->is_stack()) {
708 movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
709 }
710 }
711
712 void MacroAssembler::addptr(Register dst, int32_t imm32) {
713 addq(dst, imm32);
714 }
715
716 void MacroAssembler::addptr(Register dst, Register src) {
717 addq(dst, src);
718 }
719
720 void MacroAssembler::addptr(Address dst, Register src) {
721 addq(dst, src);
722 }
723
724 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
725 assert(rscratch != noreg || always_reachable(src), "missing");
726
727 if (reachable(src)) {
728 Assembler::addsd(dst, as_Address(src));
729 } else {
730 lea(rscratch, src);
731 Assembler::addsd(dst, Address(rscratch, 0));
732 }
733 }
734
735 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src, Register rscratch) {
736 assert(rscratch != noreg || always_reachable(src), "missing");
737
738 if (reachable(src)) {
739 addss(dst, as_Address(src));
740 } else {
741 lea(rscratch, src);
742 addss(dst, Address(rscratch, 0));
743 }
744 }
745
746 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
747 assert(rscratch != noreg || always_reachable(src), "missing");
748
749 if (reachable(src)) {
750 Assembler::addpd(dst, as_Address(src));
751 } else {
752 lea(rscratch, src);
753 Assembler::addpd(dst, Address(rscratch, 0));
754 }
755 }
756
757 // See 8273459. Function for ensuring 64-byte alignment, intended for stubs only.
758 // Stub code is generated once and never copied.
759 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
760 void MacroAssembler::align64() {
761 align(64, (uint)(uintptr_t)pc());
762 }
763
764 void MacroAssembler::align32() {
765 align(32, (uint)(uintptr_t)pc());
766 }
767
768 void MacroAssembler::align(uint modulus) {
769 // 8273459: Ensure alignment is possible with current segment alignment
770 assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
771 align(modulus, offset());
772 }
773
774 void MacroAssembler::align(uint modulus, uint target) {
775 if (target % modulus != 0) {
776 nop(modulus - (target % modulus));
777 }
778 }
779
780 void MacroAssembler::push_f(XMMRegister r) {
781 subptr(rsp, wordSize);
782 movflt(Address(rsp, 0), r);
783 }
784
785 void MacroAssembler::pop_f(XMMRegister r) {
786 movflt(r, Address(rsp, 0));
787 addptr(rsp, wordSize);
788 }
789
790 void MacroAssembler::push_d(XMMRegister r) {
791 subptr(rsp, 2 * wordSize);
792 movdbl(Address(rsp, 0), r);
793 }
794
795 void MacroAssembler::pop_d(XMMRegister r) {
796 movdbl(r, Address(rsp, 0));
797 addptr(rsp, 2 * Interpreter::stackElementSize);
798 }
799
800 void MacroAssembler::push_ppx(Register src) {
801 if (VM_Version::supports_apx_f()) {
802 pushp(src);
803 } else {
804 Assembler::push(src);
805 }
806 }
807
808 void MacroAssembler::pop_ppx(Register dst) {
809 if (VM_Version::supports_apx_f()) {
810 popp(dst);
811 } else {
812 Assembler::pop(dst);
813 }
814 }
815
816 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
817 // Used in sign-masking with aligned address.
818 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
819 assert(rscratch != noreg || always_reachable(src), "missing");
820
821 if (UseAVX > 2 &&
822 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
823 (dst->encoding() >= 16)) {
824 vpand(dst, dst, src, AVX_512bit, rscratch);
825 } else if (reachable(src)) {
826 Assembler::andpd(dst, as_Address(src));
827 } else {
828 lea(rscratch, src);
829 Assembler::andpd(dst, Address(rscratch, 0));
830 }
831 }
832
833 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register rscratch) {
834 // Used in sign-masking with aligned address.
835 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
836 assert(rscratch != noreg || always_reachable(src), "missing");
837
838 if (reachable(src)) {
839 Assembler::andps(dst, as_Address(src));
840 } else {
841 lea(rscratch, src);
842 Assembler::andps(dst, Address(rscratch, 0));
843 }
844 }
845
846 void MacroAssembler::andptr(Register dst, int32_t imm32) {
847 andq(dst, imm32);
848 }
849
850 void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) {
851 assert(rscratch != noreg || always_reachable(src), "missing");
852
853 if (reachable(src)) {
854 andq(dst, as_Address(src));
855 } else {
856 lea(rscratch, src);
857 andq(dst, Address(rscratch, 0));
858 }
859 }
860
861 void MacroAssembler::atomic_incl(Address counter_addr) {
862 lock();
863 incrementl(counter_addr);
864 }
865
866 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register rscratch) {
867 assert(rscratch != noreg || always_reachable(counter_addr), "missing");
868
869 if (reachable(counter_addr)) {
870 atomic_incl(as_Address(counter_addr));
871 } else {
872 lea(rscratch, counter_addr);
873 atomic_incl(Address(rscratch, 0));
874 }
875 }
876
877 void MacroAssembler::atomic_incq(Address counter_addr) {
878 lock();
879 incrementq(counter_addr);
880 }
881
882 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register rscratch) {
883 assert(rscratch != noreg || always_reachable(counter_addr), "missing");
884
885 if (reachable(counter_addr)) {
886 atomic_incq(as_Address(counter_addr));
887 } else {
888 lea(rscratch, counter_addr);
889 atomic_incq(Address(rscratch, 0));
890 }
891 }
892
893 // Writes to stack successive pages until offset reached to check for
894 // stack overflow + shadow pages. This clobbers tmp.
895 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
896 movptr(tmp, rsp);
897 // Bang stack for total size given plus shadow page size.
898 // Bang one page at a time because large size can bang beyond yellow and
899 // red zones.
900 Label loop;
901 bind(loop);
902 movl(Address(tmp, (-(int)os::vm_page_size())), size );
903 subptr(tmp, (int)os::vm_page_size());
904 subl(size, (int)os::vm_page_size());
905 jcc(Assembler::greater, loop);
906
907 // Bang down shadow pages too.
908 // At this point, (tmp-0) is the last address touched, so don't
909 // touch it again. (It was touched as (tmp-pagesize) but then tmp
910 // was post-decremented.) Skip this address by starting at i=1, and
911 // touch a few more pages below. N.B. It is important to touch all
912 // the way down including all pages in the shadow zone.
913 for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()); i++) {
914 // this could be any sized move but this is can be a debugging crumb
915 // so the bigger the better.
916 movptr(Address(tmp, (-i*(int)os::vm_page_size())), size );
917 }
918 }
919
920 void MacroAssembler::reserved_stack_check() {
921 // testing if reserved zone needs to be enabled
922 Label no_reserved_zone_enabling;
923
924 cmpptr(rsp, Address(r15_thread, JavaThread::reserved_stack_activation_offset()));
925 jcc(Assembler::below, no_reserved_zone_enabling);
926
927 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), r15_thread);
928 jump(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
929 should_not_reach_here();
930
931 bind(no_reserved_zone_enabling);
932 }
933
934 void MacroAssembler::c2bool(Register x) {
935 // implements x == 0 ? 0 : 1
936 // note: must only look at least-significant byte of x
937 // since C-style booleans are stored in one byte
938 // only! (was bug)
939 andl(x, 0xFF);
940 setb(Assembler::notZero, x);
941 }
942
943 // Wouldn't need if AddressLiteral version had new name
944 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
945 Assembler::call(L, rtype);
946 }
947
948 void MacroAssembler::call(Register entry) {
949 Assembler::call(entry);
950 }
951
952 void MacroAssembler::call(AddressLiteral entry, Register rscratch) {
953 assert(rscratch != noreg || always_reachable(entry), "missing");
954
955 if (reachable(entry)) {
956 Assembler::call_literal(entry.target(), entry.rspec());
957 } else {
958 lea(rscratch, entry);
959 Assembler::call(rscratch);
960 }
961 }
962
963 void MacroAssembler::ic_call(address entry, jint method_index) {
964 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
965 // Needs full 64-bit immediate for later patching.
966 Assembler::mov64(rax, (int64_t)Universe::non_oop_word());
967 call(AddressLiteral(entry, rh));
968 }
969
970 int MacroAssembler::ic_check_size() {
971 return UseCompactObjectHeaders ? 17 : 14;
972 }
973
974 int MacroAssembler::ic_check(int end_alignment) {
975 Register receiver = j_rarg0;
976 Register data = rax;
977 Register temp = rscratch1;
978
979 // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
980 // before the inline cache check, so we don't have to execute any nop instructions when dispatching
981 // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
982 // before the inline cache check here, and not after
983 align(end_alignment, offset() + ic_check_size());
984
985 int uep_offset = offset();
986
987 if (UseCompactObjectHeaders) {
988 load_narrow_klass_compact(temp, receiver);
989 cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
990 } else {
991 movl(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
992 cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
993 }
994
995 // if inline cache check fails, then jump to runtime routine
996 jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
997 assert((offset() % end_alignment) == 0, "Misaligned verified entry point (%d, %d, %d)", uep_offset, offset(), end_alignment);
998
999 return uep_offset;
1000 }
1001
1002 void MacroAssembler::emit_static_call_stub() {
1003 // Static stub relocation also tags the Method* in the code-stream.
1004 mov_metadata(rbx, (Metadata*) nullptr); // Method is zapped till fixup time.
1005 // This is recognized as unresolved by relocs/nativeinst/ic code.
1006 jump(RuntimeAddress(pc()));
1007 }
1008
1009 // Implementation of call_VM versions
1010
1011 void MacroAssembler::call_VM(Register oop_result,
1012 address entry_point,
1013 bool check_exceptions) {
1014 Label C, E;
1015 call(C, relocInfo::none);
1016 jmp(E);
1017
1018 bind(C);
1019 call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1020 ret(0);
1021
1022 bind(E);
1023 }
1024
1025 void MacroAssembler::call_VM(Register oop_result,
1026 address entry_point,
1027 Register arg_1,
1028 bool check_exceptions) {
1029 Label C, E;
1030 call(C, relocInfo::none);
1031 jmp(E);
1032
1033 bind(C);
1034 pass_arg1(this, arg_1);
1035 call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1036 ret(0);
1037
1038 bind(E);
1039 }
1040
1041 void MacroAssembler::call_VM(Register oop_result,
1042 address entry_point,
1043 Register arg_1,
1044 Register arg_2,
1045 bool check_exceptions) {
1046 Label C, E;
1047 call(C, relocInfo::none);
1048 jmp(E);
1049
1050 bind(C);
1051
1052 assert_different_registers(arg_1, c_rarg2);
1053
1054 pass_arg2(this, arg_2);
1055 pass_arg1(this, arg_1);
1056 call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1057 ret(0);
1058
1059 bind(E);
1060 }
1061
1062 void MacroAssembler::call_VM(Register oop_result,
1063 address entry_point,
1064 Register arg_1,
1065 Register arg_2,
1066 Register arg_3,
1067 bool check_exceptions) {
1068 Label C, E;
1069 call(C, relocInfo::none);
1070 jmp(E);
1071
1072 bind(C);
1073
1074 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1075 assert_different_registers(arg_2, c_rarg3);
1076 pass_arg3(this, arg_3);
1077 pass_arg2(this, arg_2);
1078 pass_arg1(this, arg_1);
1079 call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1080 ret(0);
1081
1082 bind(E);
1083 }
1084
1085 void MacroAssembler::call_VM(Register oop_result,
1086 Register last_java_sp,
1087 address entry_point,
1088 int number_of_arguments,
1089 bool check_exceptions) {
1090 call_VM_base(oop_result, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1091 }
1092
1093 void MacroAssembler::call_VM(Register oop_result,
1094 Register last_java_sp,
1095 address entry_point,
1096 Register arg_1,
1097 bool check_exceptions) {
1098 pass_arg1(this, arg_1);
1099 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1100 }
1101
1102 void MacroAssembler::call_VM(Register oop_result,
1103 Register last_java_sp,
1104 address entry_point,
1105 Register arg_1,
1106 Register arg_2,
1107 bool check_exceptions) {
1108
1109 assert_different_registers(arg_1, c_rarg2);
1110 pass_arg2(this, arg_2);
1111 pass_arg1(this, arg_1);
1112 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1113 }
1114
1115 void MacroAssembler::call_VM(Register oop_result,
1116 Register last_java_sp,
1117 address entry_point,
1118 Register arg_1,
1119 Register arg_2,
1120 Register arg_3,
1121 bool check_exceptions) {
1122 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1123 assert_different_registers(arg_2, c_rarg3);
1124 pass_arg3(this, arg_3);
1125 pass_arg2(this, arg_2);
1126 pass_arg1(this, arg_1);
1127 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1128 }
1129
1130 void MacroAssembler::super_call_VM(Register oop_result,
1131 Register last_java_sp,
1132 address entry_point,
1133 int number_of_arguments,
1134 bool check_exceptions) {
1135 MacroAssembler::call_VM_base(oop_result, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1136 }
1137
1138 void MacroAssembler::super_call_VM(Register oop_result,
1139 Register last_java_sp,
1140 address entry_point,
1141 Register arg_1,
1142 bool check_exceptions) {
1143 pass_arg1(this, arg_1);
1144 super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1145 }
1146
1147 void MacroAssembler::super_call_VM(Register oop_result,
1148 Register last_java_sp,
1149 address entry_point,
1150 Register arg_1,
1151 Register arg_2,
1152 bool check_exceptions) {
1153
1154 assert_different_registers(arg_1, c_rarg2);
1155 pass_arg2(this, arg_2);
1156 pass_arg1(this, arg_1);
1157 super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1158 }
1159
1160 void MacroAssembler::super_call_VM(Register oop_result,
1161 Register last_java_sp,
1162 address entry_point,
1163 Register arg_1,
1164 Register arg_2,
1165 Register arg_3,
1166 bool check_exceptions) {
1167 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1168 assert_different_registers(arg_2, c_rarg3);
1169 pass_arg3(this, arg_3);
1170 pass_arg2(this, arg_2);
1171 pass_arg1(this, arg_1);
1172 super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1173 }
1174
// Common implementation behind the call_VM family.
//
// Emits, in order: the thread argument setup (c_rarg0 = r15_thread), the
// last-Java-frame setup, the leaf call to entry_point, the last-Java-frame
// reset, the popframe/earlyret hooks, an optional pending-exception check
// and an optional fetch of the oop result.
//
//   oop_result          - if valid, receives the VM result oop after the call
//   last_java_sp        - register holding the last Java sp; if not valid,
//                         rsp is used instead. Must not be rbp.
//   entry_point         - address of the VM function to call
//   number_of_arguments - forwarded to call_VM_leaf_base; must be >= 0
//   check_exceptions    - if true, emit a pending-exception check that
//                         jumps to the forward_exception stub
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  int number_of_arguments,
                                  bool check_exceptions) {
  Register java_thread = r15_thread;

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // debugging support
  assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // r12 is the heapbase.
  if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // pass the java thread (becomes first argument of C function)

  mov(c_rarg0, r15_thread);

  // set last Java frame before call
  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(last_java_sp, rbp, nullptr, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

#ifdef ASSERT
  // Check that thread register is not clobbered.
  // rax is used as the temporary below and is saved/restored around the check.
  guarantee(java_thread != rax, "change this code");
  push(rax);
  { Label L;
    get_thread_slow(rax);
    cmpptr(java_thread, rax);
    jcc(Assembler::equal, L);
    STOP("MacroAssembler::call_VM_base: java_thread not callee saved?");
    bind(L);
  }
  pop(rax);
#endif

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe();
  check_and_handle_earlyret();

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach

    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result_oop(oop_result);
  }
}
1250
1251 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1252 // Calculate the value for last_Java_sp somewhat subtle.
1253 // call_VM does an intermediate call which places a return address on
1254 // the stack just under the stack pointer as the user finished with it.
1255 // This allows use to retrieve last_Java_pc from last_Java_sp[-1].
1256
1257 // We've pushed one address, correct last_Java_sp
1258 lea(rax, Address(rsp, wordSize));
1259
1260 call_VM_base(oop_result, rax, entry_point, number_of_arguments, check_exceptions);
1261 }
1262
1263 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1264 void MacroAssembler::call_VM_leaf0(address entry_point) {
1265 MacroAssembler::call_VM_leaf_base(entry_point, 0);
1266 }
1267
1268 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1269 call_VM_leaf_base(entry_point, number_of_arguments);
1270 }
1271
1272 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1273 pass_arg0(this, arg_0);
1274 call_VM_leaf(entry_point, 1);
1275 }
1276
1277 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1278
1279 assert_different_registers(arg_0, c_rarg1);
1280 pass_arg1(this, arg_1);
1281 pass_arg0(this, arg_0);
1282 call_VM_leaf(entry_point, 2);
1283 }
1284
1285 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1286 assert_different_registers(arg_0, c_rarg1, c_rarg2);
1287 assert_different_registers(arg_1, c_rarg2);
1288 pass_arg2(this, arg_2);
1289 pass_arg1(this, arg_1);
1290 pass_arg0(this, arg_0);
1291 call_VM_leaf(entry_point, 3);
1292 }
1293
1294 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1295 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
1296 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1297 assert_different_registers(arg_2, c_rarg3);
1298 pass_arg3(this, arg_3);
1299 pass_arg2(this, arg_2);
1300 pass_arg1(this, arg_1);
1301 pass_arg0(this, arg_0);
1302 call_VM_leaf(entry_point, 3);
1303 }
1304
1305 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1306 pass_arg0(this, arg_0);
1307 MacroAssembler::call_VM_leaf_base(entry_point, 1);
1308 }
1309
1310 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1311 assert_different_registers(arg_0, c_rarg1);
1312 pass_arg1(this, arg_1);
1313 pass_arg0(this, arg_0);
1314 MacroAssembler::call_VM_leaf_base(entry_point, 2);
1315 }
1316
1317 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1318 assert_different_registers(arg_0, c_rarg1, c_rarg2);
1319 assert_different_registers(arg_1, c_rarg2);
1320 pass_arg2(this, arg_2);
1321 pass_arg1(this, arg_1);
1322 pass_arg0(this, arg_0);
1323 MacroAssembler::call_VM_leaf_base(entry_point, 3);
1324 }
1325
1326 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1327 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
1328 assert_different_registers(arg_1, c_rarg2, c_rarg3);
1329 assert_different_registers(arg_2, c_rarg3);
1330 pass_arg3(this, arg_3);
1331 pass_arg2(this, arg_2);
1332 pass_arg1(this, arg_1);
1333 pass_arg0(this, arg_0);
1334 MacroAssembler::call_VM_leaf_base(entry_point, 4);
1335 }
1336
1337 void MacroAssembler::get_vm_result_oop(Register oop_result) {
1338 movptr(oop_result, Address(r15_thread, JavaThread::vm_result_oop_offset()));
1339 movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
1340 verify_oop_msg(oop_result, "broken oop in call_VM_base");
1341 }
1342
1343 void MacroAssembler::get_vm_result_metadata(Register metadata_result) {
1344 movptr(metadata_result, Address(r15_thread, JavaThread::vm_result_metadata_offset()));
1345 movptr(Address(r15_thread, JavaThread::vm_result_metadata_offset()), NULL_WORD);
1346 }
1347
1348 void MacroAssembler::check_and_handle_earlyret() {
1349 }
1350
1351 void MacroAssembler::check_and_handle_popframe() {
1352 }
1353
1354 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm, Register rscratch) {
1355 assert(rscratch != noreg || always_reachable(src1), "missing");
1356
1357 if (reachable(src1)) {
1358 cmpl(as_Address(src1), imm);
1359 } else {
1360 lea(rscratch, src1);
1361 cmpl(Address(rscratch, 0), imm);
1362 }
1363 }
1364
1365 void MacroAssembler::cmp32(Register src1, AddressLiteral src2, Register rscratch) {
1366 assert(!src2.is_lval(), "use cmpptr");
1367 assert(rscratch != noreg || always_reachable(src2), "missing");
1368
1369 if (reachable(src2)) {
1370 cmpl(src1, as_Address(src2));
1371 } else {
1372 lea(rscratch, src2);
1373 cmpl(src1, Address(rscratch, 0));
1374 }
1375 }
1376
1377 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1378 Assembler::cmpl(src1, imm);
1379 }
1380
1381 void MacroAssembler::cmp32(Register src1, Address src2) {
1382 Assembler::cmpl(src1, src2);
1383 }
1384
1385 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1386 ucomisd(opr1, opr2);
1387
1388 Label L;
1389 if (unordered_is_less) {
1390 movl(dst, -1);
1391 jcc(Assembler::parity, L);
1392 jcc(Assembler::below , L);
1393 movl(dst, 0);
1394 jcc(Assembler::equal , L);
1395 increment(dst);
1396 } else { // unordered is greater
1397 movl(dst, 1);
1398 jcc(Assembler::parity, L);
1399 jcc(Assembler::above , L);
1400 movl(dst, 0);
1401 jcc(Assembler::equal , L);
1402 decrementl(dst);
1403 }
1404 bind(L);
1405 }
1406
1407 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1408 ucomiss(opr1, opr2);
1409
1410 Label L;
1411 if (unordered_is_less) {
1412 movl(dst, -1);
1413 jcc(Assembler::parity, L);
1414 jcc(Assembler::below , L);
1415 movl(dst, 0);
1416 jcc(Assembler::equal , L);
1417 increment(dst);
1418 } else { // unordered is greater
1419 movl(dst, 1);
1420 jcc(Assembler::parity, L);
1421 jcc(Assembler::above , L);
1422 movl(dst, 0);
1423 jcc(Assembler::equal , L);
1424 decrementl(dst);
1425 }
1426 bind(L);
1427 }
1428
1429
1430 void MacroAssembler::cmp8(AddressLiteral src1, int imm, Register rscratch) {
1431 assert(rscratch != noreg || always_reachable(src1), "missing");
1432
1433 if (reachable(src1)) {
1434 cmpb(as_Address(src1), imm);
1435 } else {
1436 lea(rscratch, src1);
1437 cmpb(Address(rscratch, 0), imm);
1438 }
1439 }
1440
1441 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2, Register rscratch) {
1442 assert(rscratch != noreg || always_reachable(src2), "missing");
1443
1444 if (src2.is_lval()) {
1445 movptr(rscratch, src2);
1446 Assembler::cmpq(src1, rscratch);
1447 } else if (reachable(src2)) {
1448 cmpq(src1, as_Address(src2));
1449 } else {
1450 lea(rscratch, src2);
1451 Assembler::cmpq(src1, Address(rscratch, 0));
1452 }
1453 }
1454
1455 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2, Register rscratch) {
1456 assert(src2.is_lval(), "not a mem-mem compare");
1457 // moves src2's literal address
1458 movptr(rscratch, src2);
1459 Assembler::cmpq(src1, rscratch);
1460 }
1461
1462 void MacroAssembler::cmpoop(Register src1, Register src2) {
1463 cmpptr(src1, src2);
1464 }
1465
1466 void MacroAssembler::cmpoop(Register src1, Address src2) {
1467 cmpptr(src1, src2);
1468 }
1469
1470 void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) {
1471 movoop(rscratch, src2);
1472 cmpptr(src1, rscratch);
1473 }
1474
1475 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) {
1476 assert(rscratch != noreg || always_reachable(adr), "missing");
1477
1478 if (reachable(adr)) {
1479 lock();
1480 cmpxchgptr(reg, as_Address(adr));
1481 } else {
1482 lea(rscratch, adr);
1483 lock();
1484 cmpxchgptr(reg, Address(rscratch, 0));
1485 }
1486 }
1487
1488 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
1489 cmpxchgq(reg, adr);
1490 }
1491
1492 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1493 assert(rscratch != noreg || always_reachable(src), "missing");
1494
1495 if (reachable(src)) {
1496 Assembler::comisd(dst, as_Address(src));
1497 } else {
1498 lea(rscratch, src);
1499 Assembler::comisd(dst, Address(rscratch, 0));
1500 }
1501 }
1502
1503 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1504 assert(rscratch != noreg || always_reachable(src), "missing");
1505
1506 if (reachable(src)) {
1507 Assembler::comiss(dst, as_Address(src));
1508 } else {
1509 lea(rscratch, src);
1510 Assembler::comiss(dst, Address(rscratch, 0));
1511 }
1512 }
1513
1514
1515 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch) {
1516 assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1517
1518 Condition negated_cond = negate_condition(cond);
1519 Label L;
1520 jcc(negated_cond, L);
1521 pushf(); // Preserve flags
1522 atomic_incl(counter_addr, rscratch);
1523 popf();
1524 bind(L);
1525 }
1526
1527 int MacroAssembler::corrected_idivl(Register reg) {
1528 // Full implementation of Java idiv and irem; checks for
1529 // special case as described in JVM spec., p.243 & p.271.
1530 // The function returns the (pc) offset of the idivl
1531 // instruction - may be needed for implicit exceptions.
1532 //
1533 // normal case special case
1534 //
1535 // input : rax,: dividend min_int
1536 // reg: divisor (may not be rax,/rdx) -1
1537 //
1538 // output: rax,: quotient (= rax, idiv reg) min_int
1539 // rdx: remainder (= rax, irem reg) 0
1540 assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
1541 const int min_int = 0x80000000;
1542 Label normal_case, special_case;
1543
1544 // check for special case
1545 cmpl(rax, min_int);
1546 jcc(Assembler::notEqual, normal_case);
1547 xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
1548 cmpl(reg, -1);
1549 jcc(Assembler::equal, special_case);
1550
1551 // handle normal case
1552 bind(normal_case);
1553 cdql();
1554 int idivl_offset = offset();
1555 idivl(reg);
1556
1557 // normal and special case exit
1558 bind(special_case);
1559
1560 return idivl_offset;
1561 }
1562
1563
1564
1565 void MacroAssembler::decrementl(Register reg, int value) {
1566 if (value == min_jint) {subl(reg, value) ; return; }
1567 if (value < 0) { incrementl(reg, -value); return; }
1568 if (value == 0) { ; return; }
1569 if (value == 1 && UseIncDec) { decl(reg) ; return; }
1570 /* else */ { subl(reg, value) ; return; }
1571 }
1572
1573 void MacroAssembler::decrementl(Address dst, int value) {
1574 if (value == min_jint) {subl(dst, value) ; return; }
1575 if (value < 0) { incrementl(dst, -value); return; }
1576 if (value == 0) { ; return; }
1577 if (value == 1 && UseIncDec) { decl(dst) ; return; }
1578 /* else */ { subl(dst, value) ; return; }
1579 }
1580
1581 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
1582 assert(shift_value > 0, "illegal shift value");
1583 Label _is_positive;
1584 testl (reg, reg);
1585 jcc (Assembler::positive, _is_positive);
1586 int offset = (1 << shift_value) - 1 ;
1587
1588 if (offset == 1) {
1589 incrementl(reg);
1590 } else {
1591 addl(reg, offset);
1592 }
1593
1594 bind (_is_positive);
1595 sarl(reg, shift_value);
1596 }
1597
1598 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1599 assert(rscratch != noreg || always_reachable(src), "missing");
1600
1601 if (reachable(src)) {
1602 Assembler::divsd(dst, as_Address(src));
1603 } else {
1604 lea(rscratch, src);
1605 Assembler::divsd(dst, Address(rscratch, 0));
1606 }
1607 }
1608
1609 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1610 assert(rscratch != noreg || always_reachable(src), "missing");
1611
1612 if (reachable(src)) {
1613 Assembler::divss(dst, as_Address(src));
1614 } else {
1615 lea(rscratch, src);
1616 Assembler::divss(dst, Address(rscratch, 0));
1617 }
1618 }
1619
1620 void MacroAssembler::enter() {
1621 push(rbp);
1622 mov(rbp, rsp);
1623 }
1624
1625 void MacroAssembler::post_call_nop() {
1626 if (!Continuations::enabled()) {
1627 return;
1628 }
1629 InstructionMark im(this);
1630 relocate(post_call_nop_Relocation::spec());
1631 InlineSkippedInstructionsCounter skipCounter(this);
1632 emit_int8((uint8_t)0x0f);
1633 emit_int8((uint8_t)0x1f);
1634 emit_int8((uint8_t)0x84);
1635 emit_int8((uint8_t)0x00);
1636 emit_int32(0x00);
1637 }
1638
1639 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1640 assert(rscratch != noreg || always_reachable(src), "missing");
1641 if (reachable(src)) {
1642 Assembler::mulpd(dst, as_Address(src));
1643 } else {
1644 lea(rscratch, src);
1645 Assembler::mulpd(dst, Address(rscratch, 0));
1646 }
1647 }
1648
1649 // dst = c = a * b + c
1650 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
1651 Assembler::vfmadd231sd(c, a, b);
1652 if (dst != c) {
1653 movdbl(dst, c);
1654 }
1655 }
1656
1657 // dst = c = a * b + c
1658 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
1659 Assembler::vfmadd231ss(c, a, b);
1660 if (dst != c) {
1661 movflt(dst, c);
1662 }
1663 }
1664
1665 // dst = c = a * b + c
1666 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
1667 Assembler::vfmadd231pd(c, a, b, vector_len);
1668 if (dst != c) {
1669 vmovdqu(dst, c);
1670 }
1671 }
1672
1673 // dst = c = a * b + c
1674 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
1675 Assembler::vfmadd231ps(c, a, b, vector_len);
1676 if (dst != c) {
1677 vmovdqu(dst, c);
1678 }
1679 }
1680
1681 // dst = c = a * b + c
1682 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
1683 Assembler::vfmadd231pd(c, a, b, vector_len);
1684 if (dst != c) {
1685 vmovdqu(dst, c);
1686 }
1687 }
1688
1689 // dst = c = a * b + c
1690 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
1691 Assembler::vfmadd231ps(c, a, b, vector_len);
1692 if (dst != c) {
1693 vmovdqu(dst, c);
1694 }
1695 }
1696
1697 void MacroAssembler::incrementl(AddressLiteral dst, Register rscratch) {
1698 assert(rscratch != noreg || always_reachable(dst), "missing");
1699
1700 if (reachable(dst)) {
1701 incrementl(as_Address(dst));
1702 } else {
1703 lea(rscratch, dst);
1704 incrementl(Address(rscratch, 0));
1705 }
1706 }
1707
1708 void MacroAssembler::incrementl(ArrayAddress dst, Register rscratch) {
1709 incrementl(as_Address(dst, rscratch));
1710 }
1711
1712 void MacroAssembler::incrementl(Register reg, int value) {
1713 if (value == min_jint) {addl(reg, value) ; return; }
1714 if (value < 0) { decrementl(reg, -value); return; }
1715 if (value == 0) { ; return; }
1716 if (value == 1 && UseIncDec) { incl(reg) ; return; }
1717 /* else */ { addl(reg, value) ; return; }
1718 }
1719
1720 void MacroAssembler::incrementl(Address dst, int value) {
1721 if (value == min_jint) {addl(dst, value) ; return; }
1722 if (value < 0) { decrementl(dst, -value); return; }
1723 if (value == 0) { ; return; }
1724 if (value == 1 && UseIncDec) { incl(dst) ; return; }
1725 /* else */ { addl(dst, value) ; return; }
1726 }
1727
1728 void MacroAssembler::jump(AddressLiteral dst, Register rscratch) {
1729 assert(rscratch != noreg || always_reachable(dst), "missing");
1730 assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump");
1731 if (reachable(dst)) {
1732 jmp_literal(dst.target(), dst.rspec());
1733 } else {
1734 lea(rscratch, dst);
1735 jmp(rscratch);
1736 }
1737 }
1738
// Conditional jump to an AddressLiteral.
//
// Reachable target: the branch is encoded by hand. The 2-byte short form is
// used only when the literal carries no relocation and the displacement fits
// in 8 bits; otherwise the 6-byte form with a 32-bit displacement is used.
//
// Unreachable target: a short branch on the reversed condition skips over an
// indirect jump through rscratch.
//
// Data relocations are rejected. rscratch may be noreg only if dst is always
// reachable.
void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");
  assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump_cc");
  if (reachable(dst)) {
    InstructionMark im(this);
    relocate(dst.reloc());
    // Instruction lengths; the displacement is relative to the end of the
    // instruction, so the length is subtracted from the distance computed
    // from the current pc.
    const int short_size = 2;
    const int long_size = 6;
    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_int8(0x70 | cc);
      emit_int8((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      emit_int8(0x0F);
      emit_int8((unsigned char)(0x80 | cc));
      emit_int32(offs - long_size);
    }
  } else {
#ifdef ASSERT
    warning("reversing conditional branch");
#endif /* ASSERT */
    // Branch around the indirect jump when the condition does not hold.
    Label skip;
    jccb(reverse[cc], skip);
    lea(rscratch, dst);
    Assembler::jmp(rscratch);
    bind(skip);
  }
}
1769
1770 void MacroAssembler::cmp32_mxcsr_std(Address mxcsr_save, Register tmp, Register rscratch) {
1771 ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
1772 assert(rscratch != noreg || always_reachable(mxcsr_std), "missing");
1773
1774 stmxcsr(mxcsr_save);
1775 movl(tmp, mxcsr_save);
1776 if (EnableX86ECoreOpts) {
1777 // The mxcsr_std has status bits set for performance on ECore
1778 orl(tmp, 0x003f);
1779 } else {
1780 // Mask out status bits (only check control and mask bits)
1781 andl(tmp, 0xFFC0);
1782 }
1783 cmp32(tmp, mxcsr_std, rscratch);
1784 }
1785
1786 void MacroAssembler::ldmxcsr(AddressLiteral src, Register rscratch) {
1787 assert(rscratch != noreg || always_reachable(src), "missing");
1788
1789 if (reachable(src)) {
1790 Assembler::ldmxcsr(as_Address(src));
1791 } else {
1792 lea(rscratch, src);
1793 Assembler::ldmxcsr(Address(rscratch, 0));
1794 }
1795 }
1796
1797 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1798 int off = offset();
1799 movsbl(dst, src); // movsxb
1800 return off;
1801 }
1802
1803 // Note: load_signed_short used to be called load_signed_word.
1804 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
1805 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
1806 // The term "word" in HotSpot means a 32- or 64-bit machine word.
1807 int MacroAssembler::load_signed_short(Register dst, Address src) {
1808 // This is dubious to me since it seems safe to do a signed 16 => 64 bit
1809 // version but this is what 64bit has always done. This seems to imply
1810 // that users are only using 32bits worth.
1811 int off = offset();
1812 movswl(dst, src); // movsxw
1813 return off;
1814 }
1815
1816 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1817 // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
1818 // and "3.9 Partial Register Penalties", p. 22).
1819 int off = offset();
1820 movzbl(dst, src); // movzxb
1821 return off;
1822 }
1823
1824 // Note: load_unsigned_short used to be called load_unsigned_word.
1825 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1826 // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
1827 // and "3.9 Partial Register Penalties", p. 22).
1828 int off = offset();
1829 movzwl(dst, src); // movzxw
1830 return off;
1831 }
1832
1833 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1834 switch (size_in_bytes) {
1835 case 8: movq(dst, src); break;
1836 case 4: movl(dst, src); break;
1837 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1838 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1839 default: ShouldNotReachHere();
1840 }
1841 }
1842
1843 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1844 switch (size_in_bytes) {
1845 case 8: movq(dst, src); break;
1846 case 4: movl(dst, src); break;
1847 case 2: movw(dst, src); break;
1848 case 1: movb(dst, src); break;
1849 default: ShouldNotReachHere();
1850 }
1851 }
1852
1853 void MacroAssembler::mov32(AddressLiteral dst, Register src, Register rscratch) {
1854 assert(rscratch != noreg || always_reachable(dst), "missing");
1855
1856 if (reachable(dst)) {
1857 movl(as_Address(dst), src);
1858 } else {
1859 lea(rscratch, dst);
1860 movl(Address(rscratch, 0), src);
1861 }
1862 }
1863
1864 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
1865 if (reachable(src)) {
1866 movl(dst, as_Address(src));
1867 } else {
1868 lea(dst, src);
1869 movl(dst, Address(dst, 0));
1870 }
1871 }
1872
1873 // C++ bool manipulation
1874
1875 void MacroAssembler::movbool(Register dst, Address src) {
1876 if(sizeof(bool) == 1)
1877 movb(dst, src);
1878 else if(sizeof(bool) == 2)
1879 movw(dst, src);
1880 else if(sizeof(bool) == 4)
1881 movl(dst, src);
1882 else
1883 // unsupported
1884 ShouldNotReachHere();
1885 }
1886
1887 void MacroAssembler::movbool(Address dst, bool boolconst) {
1888 if(sizeof(bool) == 1)
1889 movb(dst, (int) boolconst);
1890 else if(sizeof(bool) == 2)
1891 movw(dst, (int) boolconst);
1892 else if(sizeof(bool) == 4)
1893 movl(dst, (int) boolconst);
1894 else
1895 // unsupported
1896 ShouldNotReachHere();
1897 }
1898
1899 void MacroAssembler::movbool(Address dst, Register src) {
1900 if(sizeof(bool) == 1)
1901 movb(dst, src);
1902 else if(sizeof(bool) == 2)
1903 movw(dst, src);
1904 else if(sizeof(bool) == 4)
1905 movl(dst, src);
1906 else
1907 // unsupported
1908 ShouldNotReachHere();
1909 }
1910
1911 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src, Register rscratch) {
1912 assert(rscratch != noreg || always_reachable(src), "missing");
1913
1914 if (reachable(src)) {
1915 movdl(dst, as_Address(src));
1916 } else {
1917 lea(rscratch, src);
1918 movdl(dst, Address(rscratch, 0));
1919 }
1920 }
1921
1922 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src, Register rscratch) {
1923 assert(rscratch != noreg || always_reachable(src), "missing");
1924
1925 if (reachable(src)) {
1926 movq(dst, as_Address(src));
1927 } else {
1928 lea(rscratch, src);
1929 movq(dst, Address(rscratch, 0));
1930 }
1931 }
1932
1933 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src, Register rscratch) {
1934 assert(rscratch != noreg || always_reachable(src), "missing");
1935
1936 if (reachable(src)) {
1937 if (UseXmmLoadAndClearUpper) {
1938 movsd (dst, as_Address(src));
1939 } else {
1940 movlpd(dst, as_Address(src));
1941 }
1942 } else {
1943 lea(rscratch, src);
1944 if (UseXmmLoadAndClearUpper) {
1945 movsd (dst, Address(rscratch, 0));
1946 } else {
1947 movlpd(dst, Address(rscratch, 0));
1948 }
1949 }
1950 }
1951
1952 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src, Register rscratch) {
1953 assert(rscratch != noreg || always_reachable(src), "missing");
1954
1955 if (reachable(src)) {
1956 movss(dst, as_Address(src));
1957 } else {
1958 lea(rscratch, src);
1959 movss(dst, Address(rscratch, 0));
1960 }
1961 }
1962
1963 void MacroAssembler::movhlf(XMMRegister dst, XMMRegister src, Register rscratch) {
1964 if (VM_Version::supports_avx10_2()) {
1965 evmovw(dst, src);
1966 } else {
1967 assert(rscratch != noreg, "missing");
1968 evmovw(rscratch, src);
1969 evmovw(dst, rscratch);
1970 }
1971 }
1972
1973 void MacroAssembler::mov64(Register dst, int64_t imm64) {
1974 if (is_uimm32(imm64)) {
1975 movl(dst, checked_cast<uint32_t>(imm64));
1976 } else if (is_simm32(imm64)) {
1977 movq(dst, checked_cast<int32_t>(imm64));
1978 } else {
1979 Assembler::mov64(dst, imm64);
1980 }
1981 }
1982
1983 void MacroAssembler::mov64(Register dst, int64_t imm64, relocInfo::relocType rtype, int format) {
1984 Assembler::mov64(dst, imm64, rtype, format);
1985 }
1986
1987 void MacroAssembler::movptr(Register dst, Register src) {
1988 movq(dst, src);
1989 }
1990
1991 void MacroAssembler::movptr(Register dst, Address src) {
1992 movq(dst, src);
1993 }
1994
1995 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
1996 void MacroAssembler::movptr(Register dst, intptr_t src) {
1997 mov64(dst, src);
1998 }
1999
2000 void MacroAssembler::movptr(Address dst, Register src) {
2001 movq(dst, src);
2002 }
2003
2004 void MacroAssembler::movptr(Address dst, int32_t src) {
2005 movslq(dst, src);
2006 }
2007
2008 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2009 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2010 Assembler::movdqu(dst, src);
2011 }
2012
2013 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2014 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2015 Assembler::movdqu(dst, src);
2016 }
2017
2018 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2019 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2020 Assembler::movdqu(dst, src);
2021 }
2022
2023 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2024 assert(rscratch != noreg || always_reachable(src), "missing");
2025
2026 if (reachable(src)) {
2027 movdqu(dst, as_Address(src));
2028 } else {
2029 lea(rscratch, src);
2030 movdqu(dst, Address(rscratch, 0));
2031 }
2032 }
2033
2034 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2035 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2036 Assembler::vmovdqu(dst, src);
2037 }
2038
2039 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2040 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2041 Assembler::vmovdqu(dst, src);
2042 }
2043
2044 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2045 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2046 Assembler::vmovdqu(dst, src);
2047 }
2048
2049 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2050 assert(rscratch != noreg || always_reachable(src), "missing");
2051
2052 if (reachable(src)) {
2053 vmovdqu(dst, as_Address(src));
2054 }
2055 else {
2056 lea(rscratch, src);
2057 vmovdqu(dst, Address(rscratch, 0));
2058 }
2059 }
2060
2061 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2062 assert(rscratch != noreg || always_reachable(src), "missing");
2063
2064 if (vector_len == AVX_512bit) {
2065 evmovdquq(dst, src, AVX_512bit, rscratch);
2066 } else if (vector_len == AVX_256bit) {
2067 vmovdqu(dst, src, rscratch);
2068 } else {
2069 movdqu(dst, src, rscratch);
2070 }
2071 }
2072
2073 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src, int vector_len) {
2074 if (vector_len == AVX_512bit) {
2075 evmovdquq(dst, src, AVX_512bit);
2076 } else if (vector_len == AVX_256bit) {
2077 vmovdqu(dst, src);
2078 } else {
2079 movdqu(dst, src);
2080 }
2081 }
2082
2083 void MacroAssembler::vmovdqu(Address dst, XMMRegister src, int vector_len) {
2084 if (vector_len == AVX_512bit) {
2085 evmovdquq(dst, src, AVX_512bit);
2086 } else if (vector_len == AVX_256bit) {
2087 vmovdqu(dst, src);
2088 } else {
2089 movdqu(dst, src);
2090 }
2091 }
2092
2093 void MacroAssembler::vmovdqu(XMMRegister dst, Address src, int vector_len) {
2094 if (vector_len == AVX_512bit) {
2095 evmovdquq(dst, src, AVX_512bit);
2096 } else if (vector_len == AVX_256bit) {
2097 vmovdqu(dst, src);
2098 } else {
2099 movdqu(dst, src);
2100 }
2101 }
2102
2103 void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
2104 assert(rscratch != noreg || always_reachable(src), "missing");
2105
2106 if (reachable(src)) {
2107 vmovdqa(dst, as_Address(src));
2108 }
2109 else {
2110 lea(rscratch, src);
2111 vmovdqa(dst, Address(rscratch, 0));
2112 }
2113 }
2114
2115 void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2116 assert(rscratch != noreg || always_reachable(src), "missing");
2117
2118 if (vector_len == AVX_512bit) {
2119 evmovdqaq(dst, src, AVX_512bit, rscratch);
2120 } else if (vector_len == AVX_256bit) {
2121 vmovdqa(dst, src, rscratch);
2122 } else {
2123 movdqa(dst, src, rscratch);
2124 }
2125 }
2126
2127 void MacroAssembler::kmov(KRegister dst, Address src) {
2128 if (VM_Version::supports_avx512bw()) {
2129 kmovql(dst, src);
2130 } else {
2131 assert(VM_Version::supports_evex(), "");
2132 kmovwl(dst, src);
2133 }
2134 }
2135
2136 void MacroAssembler::kmov(Address dst, KRegister src) {
2137 if (VM_Version::supports_avx512bw()) {
2138 kmovql(dst, src);
2139 } else {
2140 assert(VM_Version::supports_evex(), "");
2141 kmovwl(dst, src);
2142 }
2143 }
2144
2145 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2146 if (VM_Version::supports_avx512bw()) {
2147 kmovql(dst, src);
2148 } else {
2149 assert(VM_Version::supports_evex(), "");
2150 kmovwl(dst, src);
2151 }
2152 }
2153
2154 void MacroAssembler::kmov(Register dst, KRegister src) {
2155 if (VM_Version::supports_avx512bw()) {
2156 kmovql(dst, src);
2157 } else {
2158 assert(VM_Version::supports_evex(), "");
2159 kmovwl(dst, src);
2160 }
2161 }
2162
2163 void MacroAssembler::kmov(KRegister dst, Register src) {
2164 if (VM_Version::supports_avx512bw()) {
2165 kmovql(dst, src);
2166 } else {
2167 assert(VM_Version::supports_evex(), "");
2168 kmovwl(dst, src);
2169 }
2170 }
2171
2172 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register rscratch) {
2173 assert(rscratch != noreg || always_reachable(src), "missing");
2174
2175 if (reachable(src)) {
2176 kmovql(dst, as_Address(src));
2177 } else {
2178 lea(rscratch, src);
2179 kmovql(dst, Address(rscratch, 0));
2180 }
2181 }
2182
2183 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register rscratch) {
2184 assert(rscratch != noreg || always_reachable(src), "missing");
2185
2186 if (reachable(src)) {
2187 kmovwl(dst, as_Address(src));
2188 } else {
2189 lea(rscratch, src);
2190 kmovwl(dst, Address(rscratch, 0));
2191 }
2192 }
2193
2194 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2195 int vector_len, Register rscratch) {
2196 assert(rscratch != noreg || always_reachable(src), "missing");
2197
2198 if (reachable(src)) {
2199 Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2200 } else {
2201 lea(rscratch, src);
2202 Assembler::evmovdqub(dst, mask, Address(rscratch, 0), merge, vector_len);
2203 }
2204 }
2205
2206 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2207 int vector_len, Register rscratch) {
2208 assert(rscratch != noreg || always_reachable(src), "missing");
2209
2210 if (reachable(src)) {
2211 Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2212 } else {
2213 lea(rscratch, src);
2214 Assembler::evmovdquw(dst, mask, Address(rscratch, 0), merge, vector_len);
2215 }
2216 }
2217
2218 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2219 assert(rscratch != noreg || always_reachable(src), "missing");
2220
2221 if (reachable(src)) {
2222 Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2223 } else {
2224 lea(rscratch, src);
2225 Assembler::evmovdqul(dst, mask, Address(rscratch, 0), merge, vector_len);
2226 }
2227 }
2228
2229 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2230 assert(rscratch != noreg || always_reachable(src), "missing");
2231
2232 if (reachable(src)) {
2233 Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2234 } else {
2235 lea(rscratch, src);
2236 Assembler::evmovdquq(dst, mask, Address(rscratch, 0), merge, vector_len);
2237 }
2238 }
2239
2240 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2241 assert(rscratch != noreg || always_reachable(src), "missing");
2242
2243 if (reachable(src)) {
2244 Assembler::evmovdquq(dst, as_Address(src), vector_len);
2245 } else {
2246 lea(rscratch, src);
2247 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2248 }
2249 }
2250
2251 void MacroAssembler::evmovdqaq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2252 assert(rscratch != noreg || always_reachable(src), "missing");
2253
2254 if (reachable(src)) {
2255 Assembler::evmovdqaq(dst, mask, as_Address(src), merge, vector_len);
2256 } else {
2257 lea(rscratch, src);
2258 Assembler::evmovdqaq(dst, mask, Address(rscratch, 0), merge, vector_len);
2259 }
2260 }
2261
2262 void MacroAssembler::evmovdqaq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2263 assert(rscratch != noreg || always_reachable(src), "missing");
2264
2265 if (reachable(src)) {
2266 Assembler::evmovdqaq(dst, as_Address(src), vector_len);
2267 } else {
2268 lea(rscratch, src);
2269 Assembler::evmovdqaq(dst, Address(rscratch, 0), vector_len);
2270 }
2271 }
2272
2273 void MacroAssembler::movapd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2274 assert(rscratch != noreg || always_reachable(src), "missing");
2275
2276 if (reachable(src)) {
2277 Assembler::movapd(dst, as_Address(src));
2278 } else {
2279 lea(rscratch, src);
2280 Assembler::movapd(dst, Address(rscratch, 0));
2281 }
2282 }
2283
2284 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
2285 assert(rscratch != noreg || always_reachable(src), "missing");
2286
2287 if (reachable(src)) {
2288 Assembler::movdqa(dst, as_Address(src));
2289 } else {
2290 lea(rscratch, src);
2291 Assembler::movdqa(dst, Address(rscratch, 0));
2292 }
2293 }
2294
2295 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2296 assert(rscratch != noreg || always_reachable(src), "missing");
2297
2298 if (reachable(src)) {
2299 Assembler::movsd(dst, as_Address(src));
2300 } else {
2301 lea(rscratch, src);
2302 Assembler::movsd(dst, Address(rscratch, 0));
2303 }
2304 }
2305
2306 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2307 assert(rscratch != noreg || always_reachable(src), "missing");
2308
2309 if (reachable(src)) {
2310 Assembler::movss(dst, as_Address(src));
2311 } else {
2312 lea(rscratch, src);
2313 Assembler::movss(dst, Address(rscratch, 0));
2314 }
2315 }
2316
2317 void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) {
2318 assert(rscratch != noreg || always_reachable(src), "missing");
2319
2320 if (reachable(src)) {
2321 Assembler::movddup(dst, as_Address(src));
2322 } else {
2323 lea(rscratch, src);
2324 Assembler::movddup(dst, Address(rscratch, 0));
2325 }
2326 }
2327
2328 void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2329 assert(rscratch != noreg || always_reachable(src), "missing");
2330
2331 if (reachable(src)) {
2332 Assembler::vmovddup(dst, as_Address(src), vector_len);
2333 } else {
2334 lea(rscratch, src);
2335 Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
2336 }
2337 }
2338
2339 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2340 assert(rscratch != noreg || always_reachable(src), "missing");
2341
2342 if (reachable(src)) {
2343 Assembler::mulsd(dst, as_Address(src));
2344 } else {
2345 lea(rscratch, src);
2346 Assembler::mulsd(dst, Address(rscratch, 0));
2347 }
2348 }
2349
2350 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2351 assert(rscratch != noreg || always_reachable(src), "missing");
2352
2353 if (reachable(src)) {
2354 Assembler::mulss(dst, as_Address(src));
2355 } else {
2356 lea(rscratch, src);
2357 Assembler::mulss(dst, Address(rscratch, 0));
2358 }
2359 }
2360
2361 void MacroAssembler::null_check(Register reg, int offset) {
2362 if (needs_explicit_null_check(offset)) {
2363 // provoke OS null exception if reg is null by
2364 // accessing M[reg] w/o changing any (non-CC) registers
2365 // NOTE: cmpl is plenty here to provoke a segv
2366 cmpptr(rax, Address(reg, 0));
2367 // Note: should probably use testl(rax, Address(reg, 0));
2368 // may be shorter code (however, this version of
2369 // testl needs to be implemented first)
2370 } else {
2371 // nothing to do, (later) access of M[reg + offset]
2372 // will provoke OS null exception if reg is null
2373 }
2374 }
2375
2376 void MacroAssembler::os_breakpoint() {
2377 // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
2378 // (e.g., MSVC can't call ps() otherwise)
2379 call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2380 }
2381
2382 void MacroAssembler::unimplemented(const char* what) {
2383 const char* buf = nullptr;
2384 {
2385 ResourceMark rm;
2386 stringStream ss;
2387 ss.print("unimplemented: %s", what);
2388 buf = code_string(ss.as_string());
2389 }
2390 stop(buf);
2391 }
2392
// NOTE(review): presumably the byte offset (0x200 = 512) of the XSTATE_BV
// field within an XSAVE area; no use is visible in this part of the file, so
// confirm against the users of this macro.
#define XSTATE_BV 0x200
2394
2395 void MacroAssembler::pop_CPU_state() {
2396 pop_FPU_state();
2397 pop_IU_state();
2398 }
2399
2400 void MacroAssembler::pop_FPU_state() {
2401 fxrstor(Address(rsp, 0));
2402 addptr(rsp, FPUStateSizeInWords * wordSize);
2403 }
2404
2405 void MacroAssembler::pop_IU_state() {
2406 popa();
2407 addq(rsp, 8);
2408 popf();
2409 }
2410
2411 // Save Integer and Float state
2412 // Warning: Stack must be 16 byte aligned (64bit)
2413 void MacroAssembler::push_CPU_state() {
2414 push_IU_state();
2415 push_FPU_state();
2416 }
2417
2418 void MacroAssembler::push_FPU_state() {
2419 subptr(rsp, FPUStateSizeInWords * wordSize);
2420 fxsave(Address(rsp, 0));
2421 }
2422
2423 void MacroAssembler::push_IU_state() {
2424 // Push flags first because pusha kills them
2425 pushf();
2426 // Make sure rsp stays 16-byte aligned
2427 subq(rsp, 8);
2428 pusha();
2429 }
2430
2431 void MacroAssembler::push_cont_fastpath() {
2432 if (!Continuations::enabled()) return;
2433
2434 Label L_done;
2435 cmpptr(rsp, Address(r15_thread, JavaThread::cont_fastpath_offset()));
2436 jccb(Assembler::belowEqual, L_done);
2437 movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rsp);
2438 bind(L_done);
2439 }
2440
2441 void MacroAssembler::pop_cont_fastpath() {
2442 if (!Continuations::enabled()) return;
2443
2444 Label L_done;
2445 cmpptr(rsp, Address(r15_thread, JavaThread::cont_fastpath_offset()));
2446 jccb(Assembler::below, L_done);
2447 movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
2448 bind(L_done);
2449 }
2450
2451 #ifdef ASSERT
2452 void MacroAssembler::stop_if_in_cont(Register cont, const char* name) {
2453 Label no_cont;
2454 movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset()));
2455 testl(cont, cont);
2456 jcc(Assembler::zero, no_cont);
2457 stop(name);
2458 bind(no_cont);
2459 }
2460 #endif
2461
2462 void MacroAssembler::reset_last_Java_frame(bool clear_fp) { // determine java_thread register
2463 // we must set sp to zero to clear frame
2464 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
2465 // must clear fp, so that compiled frames are not confused; it is
2466 // possible that we need it only for debugging
2467 if (clear_fp) {
2468 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2469 }
2470 // Always clear the pc because it could have been set by make_walkable()
2471 movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
2472 vzeroupper();
2473 }
2474
2475 void MacroAssembler::round_to(Register reg, int modulus) {
2476 addptr(reg, modulus - 1);
2477 andptr(reg, -modulus);
2478 }
2479
2480 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod) {
2481 if (at_return) {
2482 // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
2483 // we may safely use rsp instead to perform the stack watermark check.
2484 cmpptr(in_nmethod ? rsp : rbp, Address(r15_thread, JavaThread::polling_word_offset()));
2485 jcc(Assembler::above, slow_path);
2486 return;
2487 }
2488 testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2489 jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
2490 }
2491
2492 // Calls to C land
2493 //
2494 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded
2495 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
2496 // has to be reset to 0. This is required to allow proper stack traversal.
2497 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
2498 Register last_java_fp,
2499 address last_java_pc,
2500 Register rscratch) {
2501 vzeroupper();
2502 // determine last_java_sp register
2503 if (!last_java_sp->is_valid()) {
2504 last_java_sp = rsp;
2505 }
2506 // last_java_fp is optional
2507 if (last_java_fp->is_valid()) {
2508 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
2509 }
2510 // last_java_pc is optional
2511 if (last_java_pc != nullptr) {
2512 Address java_pc(r15_thread,
2513 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
2514 lea(java_pc, InternalAddress(last_java_pc), rscratch);
2515 }
2516 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
2517 }
2518
2519 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
2520 Register last_java_fp,
2521 Label &L,
2522 Register scratch) {
2523 lea(scratch, L);
2524 movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), scratch);
2525 set_last_Java_frame(last_java_sp, last_java_fp, nullptr, scratch);
2526 }
2527
2528 void MacroAssembler::shlptr(Register dst, int imm8) {
2529 shlq(dst, imm8);
2530 }
2531
2532 void MacroAssembler::shrptr(Register dst, int imm8) {
2533 shrq(dst, imm8);
2534 }
2535
2536 void MacroAssembler::sign_extend_byte(Register reg) {
2537 movsbl(reg, reg); // movsxb
2538 }
2539
2540 void MacroAssembler::sign_extend_short(Register reg) {
2541 movswl(reg, reg); // movsxw
2542 }
2543
2544 void MacroAssembler::narrow_subword_type(Register reg, BasicType bt) {
2545 assert(is_subword_type(bt), "required");
2546 switch (bt) {
2547 case T_BOOLEAN: andl(reg, 1); break;
2548 case T_BYTE: movsbl(reg, reg); break;
2549 case T_CHAR: movzwl(reg, reg); break;
2550 case T_SHORT: movswl(reg, reg); break;
2551 default: ShouldNotReachHere();
2552 }
2553 }
2554
2555 void MacroAssembler::testl(Address dst, int32_t imm32) {
2556 if (imm32 >= 0 && is8bit(imm32)) {
2557 testb(dst, imm32);
2558 } else {
2559 Assembler::testl(dst, imm32);
2560 }
2561 }
2562
2563 void MacroAssembler::testl(Register dst, int32_t imm32) {
2564 if (imm32 >= 0 && is8bit(imm32) && dst->has_byte_register()) {
2565 testb(dst, imm32);
2566 } else {
2567 Assembler::testl(dst, imm32);
2568 }
2569 }
2570
2571 void MacroAssembler::testl(Register dst, AddressLiteral src) {
2572 assert(always_reachable(src), "Address should be reachable");
2573 testl(dst, as_Address(src));
2574 }
2575
2576 void MacroAssembler::testq(Address dst, int32_t imm32) {
2577 if (imm32 >= 0) {
2578 testl(dst, imm32);
2579 } else {
2580 Assembler::testq(dst, imm32);
2581 }
2582 }
2583
2584 void MacroAssembler::testq(Register dst, int32_t imm32) {
2585 if (imm32 >= 0) {
2586 testl(dst, imm32);
2587 } else {
2588 Assembler::testq(dst, imm32);
2589 }
2590 }
2591
2592 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
2593 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2594 Assembler::pcmpeqb(dst, src);
2595 }
2596
2597 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
2598 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2599 Assembler::pcmpeqw(dst, src);
2600 }
2601
2602 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
2603 assert((dst->encoding() < 16),"XMM register should be 0-15");
2604 Assembler::pcmpestri(dst, src, imm8);
2605 }
2606
2607 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
2608 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2609 Assembler::pcmpestri(dst, src, imm8);
2610 }
2611
2612 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
2613 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2614 Assembler::pmovzxbw(dst, src);
2615 }
2616
2617 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
2618 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2619 Assembler::pmovzxbw(dst, src);
2620 }
2621
2622 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
2623 assert((src->encoding() < 16),"XMM register should be 0-15");
2624 Assembler::pmovmskb(dst, src);
2625 }
2626
2627 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
2628 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2629 Assembler::ptest(dst, src);
2630 }
2631
2632 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2633 assert(rscratch != noreg || always_reachable(src), "missing");
2634
2635 if (reachable(src)) {
2636 Assembler::sqrtss(dst, as_Address(src));
2637 } else {
2638 lea(rscratch, src);
2639 Assembler::sqrtss(dst, Address(rscratch, 0));
2640 }
2641 }
2642
2643 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2644 assert(rscratch != noreg || always_reachable(src), "missing");
2645
2646 if (reachable(src)) {
2647 Assembler::subsd(dst, as_Address(src));
2648 } else {
2649 lea(rscratch, src);
2650 Assembler::subsd(dst, Address(rscratch, 0));
2651 }
2652 }
2653
2654 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch) {
2655 assert(rscratch != noreg || always_reachable(src), "missing");
2656
2657 if (reachable(src)) {
2658 Assembler::roundsd(dst, as_Address(src), rmode);
2659 } else {
2660 lea(rscratch, src);
2661 Assembler::roundsd(dst, Address(rscratch, 0), rmode);
2662 }
2663 }
2664
2665 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2666 assert(rscratch != noreg || always_reachable(src), "missing");
2667
2668 if (reachable(src)) {
2669 Assembler::subss(dst, as_Address(src));
2670 } else {
2671 lea(rscratch, src);
2672 Assembler::subss(dst, Address(rscratch, 0));
2673 }
2674 }
2675
2676 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2677 assert(rscratch != noreg || always_reachable(src), "missing");
2678
2679 if (reachable(src)) {
2680 Assembler::ucomisd(dst, as_Address(src));
2681 } else {
2682 lea(rscratch, src);
2683 Assembler::ucomisd(dst, Address(rscratch, 0));
2684 }
2685 }
2686
2687 void MacroAssembler::evucomxsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2688 assert(rscratch != noreg || always_reachable(src), "missing");
2689
2690 if (reachable(src)) {
2691 Assembler::evucomxsd(dst, as_Address(src));
2692 } else {
2693 lea(rscratch, src);
2694 Assembler::evucomxsd(dst, Address(rscratch, 0));
2695 }
2696 }
2697
2698 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2699 assert(rscratch != noreg || always_reachable(src), "missing");
2700
2701 if (reachable(src)) {
2702 Assembler::ucomiss(dst, as_Address(src));
2703 } else {
2704 lea(rscratch, src);
2705 Assembler::ucomiss(dst, Address(rscratch, 0));
2706 }
2707 }
2708
2709 void MacroAssembler::evucomxss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2710 assert(rscratch != noreg || always_reachable(src), "missing");
2711
2712 if (reachable(src)) {
2713 Assembler::evucomxss(dst, as_Address(src));
2714 } else {
2715 lea(rscratch, src);
2716 Assembler::evucomxss(dst, Address(rscratch, 0));
2717 }
2718 }
2719
2720 void MacroAssembler::evucomish(XMMRegister dst, AddressLiteral src, Register rscratch) {
2721 assert(rscratch != noreg || always_reachable(src), "missing");
2722
2723 if (reachable(src)) {
2724 Assembler::evucomish(dst, as_Address(src));
2725 } else {
2726 lea(rscratch, src);
2727 Assembler::evucomish(dst, Address(rscratch, 0));
2728 }
2729 }
2730
2731 void MacroAssembler::evucomxsh(XMMRegister dst, AddressLiteral src, Register rscratch) {
2732 assert(rscratch != noreg || always_reachable(src), "missing");
2733
2734 if (reachable(src)) {
2735 Assembler::evucomxsh(dst, as_Address(src));
2736 } else {
2737 lea(rscratch, src);
2738 Assembler::evucomxsh(dst, Address(rscratch, 0));
2739 }
2740 }
2741
2742 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2743 assert(rscratch != noreg || always_reachable(src), "missing");
2744
2745 // Used in sign-bit flipping with aligned address.
2746 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
2747
2748 if (UseAVX > 2 &&
2749 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2750 (dst->encoding() >= 16)) {
2751 vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch);
2752 } else if (reachable(src)) {
2753 Assembler::xorpd(dst, as_Address(src));
2754 } else {
2755 lea(rscratch, src);
2756 Assembler::xorpd(dst, Address(rscratch, 0));
2757 }
2758 }
2759
2760 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
2761 if (UseAVX > 2 &&
2762 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2763 ((dst->encoding() >= 16) || (src->encoding() >= 16))) {
2764 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
2765 } else {
2766 Assembler::xorpd(dst, src);
2767 }
2768 }
2769
2770 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
2771 if (UseAVX > 2 &&
2772 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2773 ((dst->encoding() >= 16) || (src->encoding() >= 16))) {
2774 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
2775 } else {
2776 Assembler::xorps(dst, src);
2777 }
2778 }
2779
2780 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register rscratch) {
2781 assert(rscratch != noreg || always_reachable(src), "missing");
2782
2783 // Used in sign-bit flipping with aligned address.
2784 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
2785
2786 if (UseAVX > 2 &&
2787 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
2788 (dst->encoding() >= 16)) {
2789 vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch);
2790 } else if (reachable(src)) {
2791 Assembler::xorps(dst, as_Address(src));
2792 } else {
2793 lea(rscratch, src);
2794 Assembler::xorps(dst, Address(rscratch, 0));
2795 }
2796 }
2797
2798 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src, Register rscratch) {
2799 assert(rscratch != noreg || always_reachable(src), "missing");
2800
2801 // Used in sign-bit flipping with aligned address.
2802 bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
2803 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
2804 if (reachable(src)) {
2805 Assembler::pshufb(dst, as_Address(src));
2806 } else {
2807 lea(rscratch, src);
2808 Assembler::pshufb(dst, Address(rscratch, 0));
2809 }
2810 }
2811
2812 // AVX 3-operands instructions
2813
2814 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
2815 assert(rscratch != noreg || always_reachable(src), "missing");
2816
2817 if (reachable(src)) {
2818 vaddsd(dst, nds, as_Address(src));
2819 } else {
2820 lea(rscratch, src);
2821 vaddsd(dst, nds, Address(rscratch, 0));
2822 }
2823 }
2824
2825 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
2826 assert(rscratch != noreg || always_reachable(src), "missing");
2827
2828 if (reachable(src)) {
2829 vaddss(dst, nds, as_Address(src));
2830 } else {
2831 lea(rscratch, src);
2832 vaddss(dst, nds, Address(rscratch, 0));
2833 }
2834 }
2835
2836 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2837 assert(UseAVX > 0, "requires some form of AVX");
2838 assert(rscratch != noreg || always_reachable(src), "missing");
2839
2840 if (reachable(src)) {
2841 Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
2842 } else {
2843 lea(rscratch, src);
2844 Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
2845 }
2846 }
2847
2848 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2849 assert(UseAVX > 0, "requires some form of AVX");
2850 assert(rscratch != noreg || always_reachable(src), "missing");
2851
2852 if (reachable(src)) {
2853 Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
2854 } else {
2855 lea(rscratch, src);
2856 Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
2857 }
2858 }
2859
2860 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
2861 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
2862 assert(rscratch != noreg || always_reachable(negate_field), "missing");
2863
2864 vandps(dst, nds, negate_field, vector_len, rscratch);
2865 }
2866
2867 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
2868 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
2869 assert(rscratch != noreg || always_reachable(negate_field), "missing");
2870
2871 vandpd(dst, nds, negate_field, vector_len, rscratch);
2872 }
2873
2874 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
2875 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2876 Assembler::vpaddb(dst, nds, src, vector_len);
2877 }
2878
2879 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
2880 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2881 Assembler::vpaddb(dst, nds, src, vector_len);
2882 }
2883
2884 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
2885 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2886 Assembler::vpaddw(dst, nds, src, vector_len);
2887 }
2888
2889 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
2890 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2891 Assembler::vpaddw(dst, nds, src, vector_len);
2892 }
2893
2894 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
2895 assert(rscratch != noreg || always_reachable(src), "missing");
2896
2897 if (reachable(src)) {
2898 Assembler::vpand(dst, nds, as_Address(src), vector_len);
2899 } else {
2900 lea(rscratch, src);
2901 Assembler::vpand(dst, nds, Address(rscratch, 0), vector_len);
2902 }
2903 }
2904
2905 void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2906 assert(rscratch != noreg || always_reachable(src), "missing");
2907
2908 if (reachable(src)) {
2909 Assembler::vpbroadcastd(dst, as_Address(src), vector_len);
2910 } else {
2911 lea(rscratch, src);
2912 Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len);
2913 }
2914 }
2915
2916 void MacroAssembler::vbroadcasti128(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2917 assert(rscratch != noreg || always_reachable(src), "missing");
2918
2919 if (reachable(src)) {
2920 Assembler::vbroadcasti128(dst, as_Address(src), vector_len);
2921 } else {
2922 lea(rscratch, src);
2923 Assembler::vbroadcasti128(dst, Address(rscratch, 0), vector_len);
2924 }
2925 }
2926
2927 void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2928 assert(rscratch != noreg || always_reachable(src), "missing");
2929
2930 if (reachable(src)) {
2931 Assembler::vpbroadcastq(dst, as_Address(src), vector_len);
2932 } else {
2933 lea(rscratch, src);
2934 Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len);
2935 }
2936 }
2937
2938 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2939 assert(rscratch != noreg || always_reachable(src), "missing");
2940
2941 if (reachable(src)) {
2942 Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
2943 } else {
2944 lea(rscratch, src);
2945 Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
2946 }
2947 }
2948
2949 void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2950 assert(rscratch != noreg || always_reachable(src), "missing");
2951
2952 if (reachable(src)) {
2953 Assembler::vbroadcastss(dst, as_Address(src), vector_len);
2954 } else {
2955 lea(rscratch, src);
2956 Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len);
2957 }
2958 }
2959
2960 // Vector float blend
2961 // vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
2962 void MacroAssembler::vblendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
2963 // WARN: Allow dst == (src1|src2), mask == scratch
2964 bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1 &&
2965 !(VM_Version::is_intel_darkmont() && (dst == src1)); // partially fixed on Darkmont
2966 bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst;
2967 bool dst_available = dst != mask && (dst != src1 || dst != src2);
2968 if (blend_emulation && scratch_available && dst_available) {
2969 if (compute_mask) {
2970 vpsrad(scratch, mask, 32, vector_len);
2971 mask = scratch;
2972 }
2973 if (dst == src1) {
2974 vpandn(dst, mask, src1, vector_len); // if mask == 0, src1
2975 vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
2976 } else {
2977 vpand (dst, mask, src2, vector_len); // if mask == 1, src2
2978 vpandn(scratch, mask, src1, vector_len); // if mask == 0, src1
2979 }
2980 vpor(dst, dst, scratch, vector_len);
2981 } else {
2982 Assembler::vblendvps(dst, src1, src2, mask, vector_len);
2983 }
2984 }
2985
2986 // vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
2987 void MacroAssembler::vblendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
2988 // WARN: Allow dst == (src1|src2), mask == scratch
2989 bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1 &&
2990 !(VM_Version::is_intel_darkmont() && (dst == src1)); // partially fixed on Darkmont
2991 bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst && (!compute_mask || scratch != mask);
2992 bool dst_available = dst != mask && (dst != src1 || dst != src2);
2993 if (blend_emulation && scratch_available && dst_available) {
2994 if (compute_mask) {
2995 vpxor(scratch, scratch, scratch, vector_len);
2996 vpcmpgtq(scratch, scratch, mask, vector_len);
2997 mask = scratch;
2998 }
2999 if (dst == src1) {
3000 vpandn(dst, mask, src1, vector_len); // if mask == 0, src
3001 vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
3002 } else {
3003 vpand (dst, mask, src2, vector_len); // if mask == 1, src2
3004 vpandn(scratch, mask, src1, vector_len); // if mask == 0, src
3005 }
3006 vpor(dst, dst, scratch, vector_len);
3007 } else {
3008 Assembler::vblendvpd(dst, src1, src2, mask, vector_len);
3009 }
3010 }
3011
3012 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3013 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3014 Assembler::vpcmpeqb(dst, nds, src, vector_len);
3015 }
3016
3017 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) {
3018 assert(((dst->encoding() < 16 && src1->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3019 Assembler::vpcmpeqb(dst, src1, src2, vector_len);
3020 }
3021
3022 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3023 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3024 Assembler::vpcmpeqw(dst, nds, src, vector_len);
3025 }
3026
3027 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3028 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3029 Assembler::vpcmpeqw(dst, nds, src, vector_len);
3030 }
3031
3032 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3033 assert(rscratch != noreg || always_reachable(src), "missing");
3034
3035 if (reachable(src)) {
3036 Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3037 } else {
3038 lea(rscratch, src);
3039 Assembler::evpcmpeqd(kdst, mask, nds, Address(rscratch, 0), vector_len);
3040 }
3041 }
3042
3043 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3044 int comparison, bool is_signed, int vector_len, Register rscratch) {
3045 assert(rscratch != noreg || always_reachable(src), "missing");
3046
3047 if (reachable(src)) {
3048 Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3049 } else {
3050 lea(rscratch, src);
3051 Assembler::evpcmpd(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3052 }
3053 }
3054
3055 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3056 int comparison, bool is_signed, int vector_len, Register rscratch) {
3057 assert(rscratch != noreg || always_reachable(src), "missing");
3058
3059 if (reachable(src)) {
3060 Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3061 } else {
3062 lea(rscratch, src);
3063 Assembler::evpcmpq(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3064 }
3065 }
3066
3067 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3068 int comparison, bool is_signed, int vector_len, Register rscratch) {
3069 assert(rscratch != noreg || always_reachable(src), "missing");
3070
3071 if (reachable(src)) {
3072 Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3073 } else {
3074 lea(rscratch, src);
3075 Assembler::evpcmpb(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3076 }
3077 }
3078
3079 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3080 int comparison, bool is_signed, int vector_len, Register rscratch) {
3081 assert(rscratch != noreg || always_reachable(src), "missing");
3082
3083 if (reachable(src)) {
3084 Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3085 } else {
3086 lea(rscratch, src);
3087 Assembler::evpcmpw(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3088 }
3089 }
3090
3091 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3092 if (width == Assembler::Q) {
3093 Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3094 } else {
3095 Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3096 }
3097 }
3098
3099 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
3100 int eq_cond_enc = 0x29;
3101 int gt_cond_enc = 0x37;
3102 if (width != Assembler::Q) {
3103 eq_cond_enc = 0x74 + width;
3104 gt_cond_enc = 0x64 + width;
3105 }
3106 switch (cond) {
3107 case eq:
3108 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3109 break;
3110 case neq:
3111 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3112 vallones(xtmp, vector_len);
3113 vpxor(dst, xtmp, dst, vector_len);
3114 break;
3115 case le:
3116 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3117 vallones(xtmp, vector_len);
3118 vpxor(dst, xtmp, dst, vector_len);
3119 break;
3120 case nlt:
3121 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3122 vallones(xtmp, vector_len);
3123 vpxor(dst, xtmp, dst, vector_len);
3124 break;
3125 case lt:
3126 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3127 break;
3128 case nle:
3129 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3130 break;
3131 default:
3132 assert(false, "Should not reach here");
3133 }
3134 }
3135
3136 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3137 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3138 Assembler::vpmovzxbw(dst, src, vector_len);
3139 }
3140
3141 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3142 assert((src->encoding() < 16),"XMM register should be 0-15");
3143 Assembler::vpmovmskb(dst, src, vector_len);
3144 }
3145
3146 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3147 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3148 Assembler::vpmullw(dst, nds, src, vector_len);
3149 }
3150
3151 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3152 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3153 Assembler::vpmullw(dst, nds, src, vector_len);
3154 }
3155
3156 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3157 assert((UseAVX > 0), "AVX support is needed");
3158 assert(rscratch != noreg || always_reachable(src), "missing");
3159
3160 if (reachable(src)) {
3161 Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3162 } else {
3163 lea(rscratch, src);
3164 Assembler::vpmulld(dst, nds, Address(rscratch, 0), vector_len);
3165 }
3166 }
3167
3168 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3169 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3170 Assembler::vpsubb(dst, nds, src, vector_len);
3171 }
3172
3173 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3174 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3175 Assembler::vpsubb(dst, nds, src, vector_len);
3176 }
3177
3178 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3179 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3180 Assembler::vpsubw(dst, nds, src, vector_len);
3181 }
3182
3183 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3184 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3185 Assembler::vpsubw(dst, nds, src, vector_len);
3186 }
3187
3188 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3189 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3190 Assembler::vpsraw(dst, nds, shift, vector_len);
3191 }
3192
3193 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3194 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3195 Assembler::vpsraw(dst, nds, shift, vector_len);
3196 }
3197
3198 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3199 assert(UseAVX > 2,"");
3200 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3201 vector_len = 2;
3202 }
3203 Assembler::evpsraq(dst, nds, shift, vector_len);
3204 }
3205
3206 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3207 assert(UseAVX > 2,"");
3208 if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3209 vector_len = 2;
3210 }
3211 Assembler::evpsraq(dst, nds, shift, vector_len);
3212 }
3213
3214 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3215 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3216 Assembler::vpsrlw(dst, nds, shift, vector_len);
3217 }
3218
3219 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3220 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3221 Assembler::vpsrlw(dst, nds, shift, vector_len);
3222 }
3223
3224 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3225 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3226 Assembler::vpsllw(dst, nds, shift, vector_len);
3227 }
3228
3229 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3230 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3231 Assembler::vpsllw(dst, nds, shift, vector_len);
3232 }
3233
3234 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3235 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3236 Assembler::vptest(dst, src);
3237 }
3238
3239 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3240 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3241 Assembler::punpcklbw(dst, src);
3242 }
3243
3244 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3245 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3246 Assembler::pshufd(dst, src, mode);
3247 }
3248
3249 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3250 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3251 Assembler::pshuflw(dst, src, mode);
3252 }
3253
3254 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3255 assert(rscratch != noreg || always_reachable(src), "missing");
3256
3257 if (reachable(src)) {
3258 vandpd(dst, nds, as_Address(src), vector_len);
3259 } else {
3260 lea(rscratch, src);
3261 vandpd(dst, nds, Address(rscratch, 0), vector_len);
3262 }
3263 }
3264
3265 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3266 assert(rscratch != noreg || always_reachable(src), "missing");
3267
3268 if (reachable(src)) {
3269 vandps(dst, nds, as_Address(src), vector_len);
3270 } else {
3271 lea(rscratch, src);
3272 vandps(dst, nds, Address(rscratch, 0), vector_len);
3273 }
3274 }
3275
3276 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3277 bool merge, int vector_len, Register rscratch) {
3278 assert(rscratch != noreg || always_reachable(src), "missing");
3279
3280 if (reachable(src)) {
3281 Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3282 } else {
3283 lea(rscratch, src);
3284 Assembler::evpord(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
3285 }
3286 }
3287
3288 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3289 assert(rscratch != noreg || always_reachable(src), "missing");
3290
3291 if (reachable(src)) {
3292 vdivsd(dst, nds, as_Address(src));
3293 } else {
3294 lea(rscratch, src);
3295 vdivsd(dst, nds, Address(rscratch, 0));
3296 }
3297 }
3298
3299 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3300 assert(rscratch != noreg || always_reachable(src), "missing");
3301
3302 if (reachable(src)) {
3303 vdivss(dst, nds, as_Address(src));
3304 } else {
3305 lea(rscratch, src);
3306 vdivss(dst, nds, Address(rscratch, 0));
3307 }
3308 }
3309
3310 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3311 assert(rscratch != noreg || always_reachable(src), "missing");
3312
3313 if (reachable(src)) {
3314 vmulsd(dst, nds, as_Address(src));
3315 } else {
3316 lea(rscratch, src);
3317 vmulsd(dst, nds, Address(rscratch, 0));
3318 }
3319 }
3320
3321 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3322 assert(rscratch != noreg || always_reachable(src), "missing");
3323
3324 if (reachable(src)) {
3325 vmulss(dst, nds, as_Address(src));
3326 } else {
3327 lea(rscratch, src);
3328 vmulss(dst, nds, Address(rscratch, 0));
3329 }
3330 }
3331
3332 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3333 assert(rscratch != noreg || always_reachable(src), "missing");
3334
3335 if (reachable(src)) {
3336 vsubsd(dst, nds, as_Address(src));
3337 } else {
3338 lea(rscratch, src);
3339 vsubsd(dst, nds, Address(rscratch, 0));
3340 }
3341 }
3342
3343 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3344 assert(rscratch != noreg || always_reachable(src), "missing");
3345
3346 if (reachable(src)) {
3347 vsubss(dst, nds, as_Address(src));
3348 } else {
3349 lea(rscratch, src);
3350 vsubss(dst, nds, Address(rscratch, 0));
3351 }
3352 }
3353
3354 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3355 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3356 assert(rscratch != noreg || always_reachable(src), "missing");
3357
3358 vxorps(dst, nds, src, Assembler::AVX_128bit, rscratch);
3359 }
3360
3361 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3362 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3363 assert(rscratch != noreg || always_reachable(src), "missing");
3364
3365 vxorpd(dst, nds, src, Assembler::AVX_128bit, rscratch);
3366 }
3367
3368 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3369 assert(rscratch != noreg || always_reachable(src), "missing");
3370
3371 if (reachable(src)) {
3372 vxorpd(dst, nds, as_Address(src), vector_len);
3373 } else {
3374 lea(rscratch, src);
3375 vxorpd(dst, nds, Address(rscratch, 0), vector_len);
3376 }
3377 }
3378
3379 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3380 assert(rscratch != noreg || always_reachable(src), "missing");
3381
3382 if (reachable(src)) {
3383 vxorps(dst, nds, as_Address(src), vector_len);
3384 } else {
3385 lea(rscratch, src);
3386 vxorps(dst, nds, Address(rscratch, 0), vector_len);
3387 }
3388 }
3389
3390 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3391 assert(rscratch != noreg || always_reachable(src), "missing");
3392
3393 if (UseAVX > 1 || (vector_len < 1)) {
3394 if (reachable(src)) {
3395 Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3396 } else {
3397 lea(rscratch, src);
3398 Assembler::vpxor(dst, nds, Address(rscratch, 0), vector_len);
3399 }
3400 } else {
3401 MacroAssembler::vxorpd(dst, nds, src, vector_len, rscratch);
3402 }
3403 }
3404
3405 void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3406 assert(rscratch != noreg || always_reachable(src), "missing");
3407
3408 if (reachable(src)) {
3409 Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3410 } else {
3411 lea(rscratch, src);
3412 Assembler::vpermd(dst, nds, Address(rscratch, 0), vector_len);
3413 }
3414 }
3415
3416 void MacroAssembler::clear_jobject_tag(Register possibly_non_local) {
3417 const int32_t inverted_mask = ~static_cast<int32_t>(JNIHandles::tag_mask);
3418 STATIC_ASSERT(inverted_mask == -4); // otherwise check this code
3419 // The inverted mask is sign-extended
3420 andptr(possibly_non_local, inverted_mask);
3421 }
3422
3423 void MacroAssembler::resolve_jobject(Register value,
3424 Register tmp) {
3425 Register thread = r15_thread;
3426 assert_different_registers(value, thread, tmp);
3427 Label done, tagged, weak_tagged;
3428 testptr(value, value);
3429 jcc(Assembler::zero, done); // Use null as-is.
3430 testptr(value, JNIHandles::tag_mask); // Test for tag.
3431 jcc(Assembler::notZero, tagged);
3432
3433 // Resolve local handle
3434 access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp);
3435 verify_oop(value);
3436 jmp(done);
3437
3438 bind(tagged);
3439 testptr(value, JNIHandles::TypeTag::weak_global); // Test for weak tag.
3440 jcc(Assembler::notZero, weak_tagged);
3441
3442 // Resolve global handle
3443 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp);
3444 verify_oop(value);
3445 jmp(done);
3446
3447 bind(weak_tagged);
3448 // Resolve jweak.
3449 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3450 value, Address(value, -JNIHandles::TypeTag::weak_global), tmp);
3451 verify_oop(value);
3452
3453 bind(done);
3454 }
3455
3456 void MacroAssembler::resolve_global_jobject(Register value,
3457 Register tmp) {
3458 Register thread = r15_thread;
3459 assert_different_registers(value, thread, tmp);
3460 Label done;
3461
3462 testptr(value, value);
3463 jcc(Assembler::zero, done); // Use null as-is.
3464
3465 #ifdef ASSERT
3466 {
3467 Label valid_global_tag;
3468 testptr(value, JNIHandles::TypeTag::global); // Test for global tag.
3469 jcc(Assembler::notZero, valid_global_tag);
3470 stop("non global jobject using resolve_global_jobject");
3471 bind(valid_global_tag);
3472 }
3473 #endif
3474
3475 // Resolve global handle
3476 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp);
3477 verify_oop(value);
3478
3479 bind(done);
3480 }
3481
3482 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3483 subq(dst, imm32);
3484 }
3485
3486 // Force generation of a 4 byte immediate value even if it fits into 8bit
3487 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3488 subq_imm32(dst, imm32);
3489 }
3490
3491 void MacroAssembler::subptr(Register dst, Register src) {
3492 subq(dst, src);
3493 }
3494
3495 // C++ bool manipulation
3496 void MacroAssembler::testbool(Register dst) {
3497 if(sizeof(bool) == 1)
3498 testb(dst, 0xff);
3499 else if(sizeof(bool) == 2) {
3500 // testw implementation needed for two byte bools
3501 ShouldNotReachHere();
3502 } else if(sizeof(bool) == 4)
3503 testl(dst, dst);
3504 else
3505 // unsupported
3506 ShouldNotReachHere();
3507 }
3508
3509 void MacroAssembler::testptr(Register dst, Register src) {
3510 testq(dst, src);
3511 }
3512
3513 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3514 void MacroAssembler::tlab_allocate(Register obj,
3515 Register var_size_in_bytes,
3516 int con_size_in_bytes,
3517 Register t1,
3518 Register t2,
3519 Label& slow_case) {
3520 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3521 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3522 }
3523
3524 RegSet MacroAssembler::call_clobbered_gp_registers() {
3525 RegSet regs;
3526 regs += RegSet::of(rax, rcx, rdx);
3527 #ifndef _WINDOWS
3528 regs += RegSet::of(rsi, rdi);
3529 #endif
3530 regs += RegSet::range(r8, r11);
3531 if (UseAPX) {
3532 regs += RegSet::range(r16, as_Register(Register::number_of_registers - 1));
3533 }
3534 return regs;
3535 }
3536
3537 XMMRegSet MacroAssembler::call_clobbered_xmm_registers() {
3538 int num_xmm_registers = XMMRegister::available_xmm_registers();
3539 #if defined(_WINDOWS)
3540 XMMRegSet result = XMMRegSet::range(xmm0, xmm5);
3541 if (num_xmm_registers > 16) {
3542 result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1));
3543 }
3544 return result;
3545 #else
3546 return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1));
3547 #endif
3548 }
3549
// C1 only ever uses the first double/float of the XMM register.
// Returns the number of bytes used per XMM register in the save area.
static int xmm_save_size() {
  return static_cast<int>(sizeof(double));
}
3552
3553 static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
3554 masm->movdbl(Address(rsp, offset), reg);
3555 }
3556
3557 static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
3558 masm->movdbl(reg, Address(rsp, offset));
3559 }
3560
3561 static int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers,
3562 bool save_fpu, int& gp_area_size, int& xmm_area_size) {
3563
3564 gp_area_size = align_up(gp_registers.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size,
3565 StackAlignmentInBytes);
3566 xmm_area_size = save_fpu ? xmm_registers.size() * xmm_save_size() : 0;
3567
3568 return gp_area_size + xmm_area_size;
3569 }
3570
3571 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) {
3572 block_comment("push_call_clobbered_registers start");
3573 // Regular registers
3574 RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude;
3575
3576 int gp_area_size;
3577 int xmm_area_size;
3578 int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu,
3579 gp_area_size, xmm_area_size);
3580 subptr(rsp, total_save_size);
3581
3582 push_set(gp_registers_to_push, 0);
3583
3584 if (save_fpu) {
3585 push_set(call_clobbered_xmm_registers(), gp_area_size);
3586 }
3587
3588 block_comment("push_call_clobbered_registers end");
3589 }
3590
3591 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) {
3592 block_comment("pop_call_clobbered_registers start");
3593
3594 RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude;
3595
3596 int gp_area_size;
3597 int xmm_area_size;
3598 int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu,
3599 gp_area_size, xmm_area_size);
3600
3601 if (restore_fpu) {
3602 pop_set(call_clobbered_xmm_registers(), gp_area_size);
3603 }
3604
3605 pop_set(gp_registers_to_pop, 0);
3606
3607 addptr(rsp, total_save_size);
3608
3609 vzeroupper();
3610
3611 block_comment("pop_call_clobbered_registers end");
3612 }
3613
3614 void MacroAssembler::push_set(XMMRegSet set, int offset) {
3615 assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be");
3616 int spill_offset = offset;
3617
3618 for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) {
3619 save_xmm_register(this, spill_offset, *it);
3620 spill_offset += xmm_save_size();
3621 }
3622 }
3623
3624 void MacroAssembler::pop_set(XMMRegSet set, int offset) {
3625 int restore_size = set.size() * xmm_save_size();
3626 assert(is_aligned(restore_size, StackAlignmentInBytes), "must be");
3627
3628 int restore_offset = offset + restore_size - xmm_save_size();
3629
3630 for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) {
3631 restore_xmm_register(this, restore_offset, *it);
3632 restore_offset -= xmm_save_size();
3633 }
3634 }
3635
3636 void MacroAssembler::push_set(RegSet set, int offset) {
3637 int spill_offset;
3638 if (offset == -1) {
3639 int register_push_size = set.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3640 int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
3641 subptr(rsp, aligned_size);
3642 spill_offset = 0;
3643 } else {
3644 spill_offset = offset;
3645 }
3646
3647 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
3648 movptr(Address(rsp, spill_offset), *it);
3649 spill_offset += Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3650 }
3651 }
3652
3653 void MacroAssembler::pop_set(RegSet set, int offset) {
3654
3655 int gp_reg_size = Register::max_slots_per_register * VMRegImpl::stack_slot_size;
3656 int restore_size = set.size() * gp_reg_size;
3657 int aligned_size = align_up(restore_size, StackAlignmentInBytes);
3658
3659 int restore_offset;
3660 if (offset == -1) {
3661 restore_offset = restore_size - gp_reg_size;
3662 } else {
3663 restore_offset = offset + restore_size - gp_reg_size;
3664 }
3665 for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) {
3666 movptr(*it, Address(rsp, restore_offset));
3667 restore_offset -= gp_reg_size;
3668 }
3669
3670 if (offset == -1) {
3671 addptr(rsp, aligned_size);
3672 }
3673 }
3674
3675 // Preserves the contents of address, destroys the contents length_in_bytes and temp.
3676 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
3677 assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
3678 assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
3679 Label done;
3680
3681 testptr(length_in_bytes, length_in_bytes);
3682 jcc(Assembler::zero, done);
3683
3684 // initialize topmost word, divide index by 2, check if odd and test if zero
3685 // note: for the remaining code to work, index must be a multiple of BytesPerWord
3686 #ifdef ASSERT
3687 {
3688 Label L;
3689 testptr(length_in_bytes, BytesPerWord - 1);
3690 jcc(Assembler::zero, L);
3691 stop("length must be a multiple of BytesPerWord");
3692 bind(L);
3693 }
3694 #endif
3695 Register index = length_in_bytes;
3696 xorptr(temp, temp); // use _zero reg to clear memory (shorter code)
3697 if (UseIncDec) {
3698 shrptr(index, 3); // divide by 8/16 and set carry flag if bit 2 was set
3699 } else {
3700 shrptr(index, 2); // use 2 instructions to avoid partial flag stall
3701 shrptr(index, 1);
3702 }
3703
3704 // initialize remaining object fields: index is a multiple of 2 now
3705 {
3706 Label loop;
3707 bind(loop);
3708 movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
3709 decrement(index);
3710 jcc(Assembler::notZero, loop);
3711 }
3712
3713 bind(done);
3714 }
3715
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
//
// Register usage, as visible in the emitted code:
//  - scan_temp is overwritten; it walks the itableOffsetEntry records.
//  - method_result is overwritten by the scan even when return_method is false.
//  - recv_klass is overwritten (advanced by the scaled itable_index) when
//    return_method is true.
//  - a non-constant itable_index must be passed in method_result (asserted).
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");

  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = in_bytes(itableMethodEntry::method_offset());
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  // scan_temp = number of vtable entries (loaded as a 32-bit value)
  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // scan_temp = recv_klass + vtable_base + vtable_length * entry size
  // Could store the aligned, prescaled offset in the klass.
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  }

  // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The compare is emitted twice. The first copy (peel == 1) tests the first
  // entry and jumps straight to found_method on a hit. The second copy
  // (peel == 0) is the loop body proper: on a mismatch it jumps back to
  // 'search', on a hit it falls through to found_method.
  for (int peel = 1; peel >= 0; peel--) {
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset()));
    cmpptr(intf_klass, method_result);

    if (peel) {
      jccb(Assembler::equal, found_method);
    } else {
      jccb(Assembler::notEqual, search);
      // (invert the test to fall through to found_method...)
    }

    // The second copy needs no null check / advance after it.
    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    testptr(method_result, method_result);
    jcc(Assembler::zero, L_no_such_interface);
    addptr(scan_temp, scan_step);
  }

  bind(found_method);

  if (return_method) {
    // Got a hit.
    // scan_temp = 32-bit offset stored in the matching itableOffsetEntry;
    // recv_klass was already advanced by the scaled itable_index above.
    movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset()));
    movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
  }
}
3793
// Look up the method for a megamorphic invokeinterface call in a single pass over itable:
// - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
// - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
// The target method is determined by <holder_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
//
// Register usage, as visible in the emitted code:
//  - method_result doubles as the scratch register holding the interface
//    klass of the itable entry currently being examined.
//  - scan_temp is overwritten; it walks the itable.
//  - temp_reg2, when valid, receives the holder's offset; when it is noreg,
//    recv_klass is used for that purpose instead and the klass is reloaded
//    from 'receiver' at the end.
void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
                                                  Register holder_klass,
                                                  Register resolved_klass,
                                                  Register method_result,
                                                  Register scan_temp,
                                                  Register temp_reg2,
                                                  Register receiver,
                                                  int itable_index,
                                                  Label& L_no_such_interface) {
  assert_different_registers(recv_klass, method_result, holder_klass, resolved_klass, scan_temp, temp_reg2, receiver);
  Register temp_itbl_klass = method_result;
  Register temp_reg = (temp_reg2 == noreg ? recv_klass : temp_reg2); // reuse recv_klass register on 32-bit x86 impl

  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = in_bytes(itableMethodEntry::method_offset());
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  int ioffset = in_bytes(itableOffsetEntry::interface_offset());
  int ooffset = in_bytes(itableOffsetEntry::offset_offset());
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "adjust times_vte_scale");

  Label L_loop_scan_resolved_entry, L_resolved_found, L_holder_found;

  // temp_itbl_klass = recv_klass.itable[0]
  // scan_temp = &recv_klass.itable[0] + step
  // Note that scan_temp points at the *interface field* of the next entry
  // (ioffset is folded in), which is why later loads use displacement 0 and
  // the offset field of the previous entry is at ooffset - ioffset - scan_step.
  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
  movptr(temp_itbl_klass, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset));
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset + scan_step));
  // temp_reg == 0 means "holder entry not seen yet" (tested at L_resolved_found).
  xorptr(temp_reg, temp_reg);

  // Initial checks:
  // - if (holder_klass != resolved_klass), go to "scan for resolved"
  // - if (itable[0] == 0), no such interface
  // - if (itable[0] == holder_klass), shortcut to "holder found"
  cmpptr(holder_klass, resolved_klass);
  jccb(Assembler::notEqual, L_loop_scan_resolved_entry);
  testptr(temp_itbl_klass, temp_itbl_klass);
  jccb(Assembler::zero, L_no_such_interface);
  cmpptr(holder_klass, temp_itbl_klass);
  jccb(Assembler::equal, L_holder_found);

  // Loop: Look for holder_klass record in itable
  // do {
  //   tmp = itable[index];
  //   index += step;
  //   if (tmp == holder_klass) {
  //     goto L_holder_found; // Found!
  //   }
  // } while (tmp != 0);
  // goto L_no_such_interface // Not found.
  Label L_scan_holder;
  bind(L_scan_holder);
  movptr(temp_itbl_klass, Address(scan_temp, 0));
  addptr(scan_temp, scan_step);
  cmpptr(holder_klass, temp_itbl_klass);
  jccb(Assembler::equal, L_holder_found);
  testptr(temp_itbl_klass, temp_itbl_klass);
  jccb(Assembler::notZero, L_scan_holder);

  jmpb(L_no_such_interface);

  // Loop: Look for resolved_class record in itable
  // do {
  //   tmp = itable[index];
  //   index += step;
  //   if (tmp == holder_klass) {
  //     // Also check if we have met a holder klass
  //     holder_tmp = itable[index-step-ioffset];
  //   }
  //   if (tmp == resolved_klass) {
  //     goto L_resolved_found; // Found!
  //   }
  // } while (tmp != 0);
  // goto L_no_such_interface // Not found.
  //
  Label L_loop_scan_resolved;
  bind(L_loop_scan_resolved);
  movptr(temp_itbl_klass, Address(scan_temp, 0));
  addptr(scan_temp, scan_step);
  // Entry point used by the initial checks: itable[0] is already loaded and
  // scan_temp already points one step past it.
  bind(L_loop_scan_resolved_entry);
  cmpptr(holder_klass, temp_itbl_klass);
  // If this entry is the holder, remember its offset field without branching.
  cmovl(Assembler::equal, temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
  cmpptr(resolved_klass, temp_itbl_klass);
  jccb(Assembler::equal, L_resolved_found);
  testptr(temp_itbl_klass, temp_itbl_klass);
  jccb(Assembler::notZero, L_loop_scan_resolved);

  jmpb(L_no_such_interface);

  Label L_ready;

  // See if we already have a holder klass. If not, go and scan for it.
  bind(L_resolved_found);
  testptr(temp_reg, temp_reg);
  jccb(Assembler::zero, L_scan_holder);
  jmpb(L_ready);

  bind(L_holder_found);
  // Load the offset field of the entry that matched holder_klass.
  movl(temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));

  // Finally, temp_reg contains holder_klass vtable offset
  bind(L_ready);
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  if (temp_reg2 == noreg) { // recv_klass register is clobbered for 32-bit x86 impl
    // temp_reg aliases recv_klass here, so reload the klass from the receiver.
    load_klass(scan_temp, receiver, noreg);
    movptr(method_result, Address(scan_temp, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
  } else {
    movptr(method_result, Address(recv_klass, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
  }
}
3912
3913
3914 // virtual method calling
3915 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3916 RegisterOrConstant vtable_index,
3917 Register method_result) {
3918 const ByteSize base = Klass::vtable_start_offset();
3919 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
3920 Address vtable_entry_addr(recv_klass,
3921 vtable_index, Address::times_ptr,
3922 base + vtableEntry::method_offset());
3923 movptr(method_result, vtable_entry_addr);
3924 }
3925
3926
3927 void MacroAssembler::check_klass_subtype(Register sub_klass,
3928 Register super_klass,
3929 Register temp_reg,
3930 Label& L_success) {
3931 Label L_failure;
3932 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, nullptr);
3933 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, nullptr);
3934 bind(L_failure);
3935 }
3936
3937
// Emits the fast part of a subtype check of sub_klass against super_klass.
//
// Outcomes are reported by branching to L_success, L_failure or L_slow_path.
// At most one of the three label pointers may be null; the null one means
// "fall through to the code following this sequence".
//
// super_check_offset may be:
//  - a register already holding the offset,
//  - a constant offset known at code-generation time, or
//  - the constant -1, in which case the offset is loaded from super_klass at
//    run time into temp_reg (which must then be a valid register).
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  // Redirect any null label to the local fall-through label.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
  if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb. If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label) \
  if (&(label) == &L_fallthrough) jccb(assembler_cond, label); \
  else jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label) \
  if (&(label) == &L_fallthrough) { /*do nothing*/ } \
  else jmp(label) /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface. Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  // The three cases below differ only in what is known about the offset at
  // code-generation time; each one picks the branch arrangement that lets the
  // null (fall-through) label be reached without an extra jump.
  if (super_check_offset.is_register()) {
    // Offset only known at run time: a hit succeeds; on a miss, an offset
    // equal to sc_offset selects the slow path, any other offset fails.
    local_jcc(Assembler::equal, *L_success);
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}
4039
4040
// Emits the slow part of a subtype check as a linear scan of sub_klass's
// secondary supers array, using 'repne scas'.
//
// On a hit, super_klass is stored into sub_klass's secondary super cache and
// control goes to L_success; on a miss control goes to L_failure. At most one
// of the two label pointers may be null, meaning "fall through".
//
// rax, rcx and rdi are used by the scan. Each of them is pushed and popped
// around it unless it was passed as temp_reg or temp2_reg (rax is also left
// unsaved when it already holds super_klass). sub_klass must not be rax or rcx.
//
// When set_cond_codes is true, rdi must be one of the temps (asserted), so
// that it is not restored after the scan.
void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
                                                          Register super_klass,
                                                          Register temp_reg,
                                                          Register temp2_reg,
                                                          Label* L_success,
                                                          Label* L_failure,
                                                          bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  // Redirect a null label to the local fall-through label.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr( sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)

  // Get super_klass value into rax (even if it was in rdi or rcx).
  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  if (super_klass != rax) {
    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
    mov(rax, super_klass);
  }
  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }

#ifndef PRODUCT
  // Non-product builds count how often this path is executed.
  uint* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  lea(rcx, pst_counter_addr);
  incrementl(Address(rcx, 0));
#endif //PRODUCT

  // We will consult the secondary-super array.
  movptr(rdi, secondary_supers_addr);
  // Load the array length. (Positive movl does right thing on LP64.)
  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  addptr(rdi, Array<Klass*>::base_offset_in_bytes());

  // Scan RCX words at [RDI] for an occurrence of RAX.
  // Set NZ/Z based on last compare.
  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
  // not change flags (only scas instruction which is repeated sets flags).
  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.

  // NOTE(review): this clears Z only when rax (super_klass) is non-zero;
  // presumably callers never pass a null super_klass -- confirm.
  testptr(rax,rax); // Set Z = 0
  repne_scan();

  // Unspill the temp. registers:
  // (reverse order of the pushes above)
  if (pushed_rdi) pop(rdi);
  if (pushed_rcx) pop(rcx);
  if (pushed_rax) pop(rax);

  if (set_cond_codes) {
    // Special hack for the AD files: rdi is guaranteed non-zero.
    assert(!pushed_rdi, "rdi must be left non-null");
    // Also, the condition codes are properly set Z/NZ on succeed/failure.
  }

  // The branch below consumes the flags left by the scan.
  if (L_failure == &L_fallthrough)
    jccb(Assembler::notEqual, *L_failure);
  else jcc(Assembler::notEqual, *L_failure);

  // Success. Cache the super we found and proceed in triumph.
  movptr(super_cache_addr, super_klass);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
4131
4132 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4133 Register super_klass,
4134 Register temp_reg,
4135 Register temp2_reg,
4136 Label* L_success,
4137 Label* L_failure,
4138 bool set_cond_codes) {
4139 assert(set_cond_codes == false, "must be false on 64-bit x86");
4140 check_klass_subtype_slow_path
4141 (sub_klass, super_klass, temp_reg, temp2_reg, noreg, noreg,
4142 L_success, L_failure);
4143 }
4144
4145 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4146 Register super_klass,
4147 Register temp_reg,
4148 Register temp2_reg,
4149 Register temp3_reg,
4150 Register temp4_reg,
4151 Label* L_success,
4152 Label* L_failure) {
4153 if (UseSecondarySupersTable) {
4154 check_klass_subtype_slow_path_table
4155 (sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, temp4_reg,
4156 L_success, L_failure);
4157 } else {
4158 check_klass_subtype_slow_path_linear
4159 (sub_klass, super_klass, temp_reg, temp2_reg, L_success, L_failure, /*set_cond_codes*/false);
4160 }
4161 }
4162
4163 Register MacroAssembler::allocate_if_noreg(Register r,
4164 RegSetIterator<Register> &available_regs,
4165 RegSet ®s_to_push) {
4166 if (!r->is_valid()) {
4167 r = *available_regs++;
4168 regs_to_push += r;
4169 }
4170 return r;
4171 }
4172
// Emits the slow part of a subtype check using the secondary supers table
// (via lookup_secondary_supers_table_var).
//
// Any of temp_reg, temp2_reg, temp3_reg and result_reg may be noreg; a
// register is then borrowed from a fixed candidate list and saved/restored
// on the stack around the lookup. One further temp is always borrowed.
//
// Control goes to L_success on a hit and to L_failure on a miss. At most one
// of the two label pointers may be null, meaning "fall through".
void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
                                                         Register super_klass,
                                                         Register temp_reg,
                                                         Register temp2_reg,
                                                         Register temp3_reg,
                                                         Register result_reg,
                                                         Label* L_success,
                                                         Label* L_failure) {
  // NB! Callers may assume that, when temp2_reg is a valid register,
  // this code sets it to a nonzero value.
  bool temp2_reg_was_valid = temp2_reg->is_valid();

  RegSet temps = RegSet::of(temp_reg, temp2_reg, temp3_reg);

  // Redirect a null label to the local fall-through label.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  BLOCK_COMMENT("check_klass_subtype_slow_path_table");

  // Candidate registers for missing temps: the fixed list minus everything
  // the caller already uses as a temp or as an input.
  RegSetIterator<Register> available_regs
    = (RegSet::of(rax, rcx, rdx, r8) + r9 + r10 + r11 + r12 - temps - sub_klass - super_klass).begin();

  // Registers borrowed below; these are the ones saved and restored.
  RegSet pushed_regs;

  temp_reg = allocate_if_noreg(temp_reg, available_regs, pushed_regs);
  temp2_reg = allocate_if_noreg(temp2_reg, available_regs, pushed_regs);
  temp3_reg = allocate_if_noreg(temp3_reg, available_regs, pushed_regs);
  result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
  Register temp4_reg = allocate_if_noreg(noreg, available_regs, pushed_regs);

  assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, result_reg);

  {

    // Reserve an aligned stack area and spill the borrowed registers into it.
    int register_push_size = pushed_regs.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
    int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
    subptr(rsp, aligned_size);
    push_set(pushed_regs, 0);

    lookup_secondary_supers_table_var(sub_klass,
                                      super_klass,
                                      temp_reg, temp2_reg, temp3_reg, temp4_reg, result_reg);
    // Capture the outcome in the flags now: result_reg may be one of the
    // borrowed registers and is about to be restored.
    cmpq(result_reg, 0);

    // Unspill the temp. registers:
    pop_set(pushed_regs, 0);
    // Increment SP but do not clobber flags.
    lea(rsp, Address(rsp, aligned_size));
  }

  // Honour the contract stated at the top of this function. This sits between
  // the cmpq above and the jcc below, so it must leave the flags intact.
  if (temp2_reg_was_valid) {
    movq(temp2_reg, 1);
  }

  // Flags still reflect cmpq(result_reg, 0): nonzero result means failure.
  jcc(Assembler::notEqual, *L_failure);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

  bind(L_fallthrough);
}
4238
4239 // population_count variant for running without the POPCNT
4240 // instruction, which was introduced with SSE4.2 in 2008.
4241 void MacroAssembler::population_count(Register dst, Register src,
4242 Register scratch1, Register scratch2) {
4243 assert_different_registers(src, scratch1, scratch2);
4244 if (UsePopCountInstruction) {
4245 Assembler::popcntq(dst, src);
4246 } else {
4247 assert_different_registers(src, scratch1, scratch2);
4248 assert_different_registers(dst, scratch1, scratch2);
4249 Label loop, done;
4250
4251 mov(scratch1, src);
4252 // dst = 0;
4253 // while(scratch1 != 0) {
4254 // dst++;
4255 // scratch1 &= (scratch1 - 1);
4256 // }
4257 xorl(dst, dst);
4258 testq(scratch1, scratch1);
4259 jccb(Assembler::equal, done);
4260 {
4261 bind(loop);
4262 incq(dst);
4263 movq(scratch2, scratch1);
4264 decq(scratch2);
4265 andq(scratch1, scratch2);
4266 jccb(Assembler::notEqual, loop);
4267 }
4268 bind(done);
4269 }
4270 #ifdef ASSERT
4271 mov64(scratch1, 0xCafeBabeDeadBeef);
4272 movq(scratch2, scratch1);
4273 #endif
4274 }
4275
// Ensure that the inline code and the stub are using the same registers.
//
// Expands to a block of asserts over names that must be in scope at the point
// of use: r_super_klass, r_array_base, r_array_length, r_array_index,
// r_sub_klass, r_bitmap and result. The first four must be exactly
// rax, rbx, rcx and rdx; r_sub_klass, r_bitmap and result must be rsi, r11
// and rdi respectively, or noreg.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \
do { \
  assert(r_super_klass == rax, "mismatch"); \
  assert(r_array_base == rbx, "mismatch"); \
  assert(r_array_length == rcx, "mismatch"); \
  assert(r_array_index == rdx, "mismatch"); \
  assert(r_sub_klass == rsi || r_sub_klass == noreg, "mismatch"); \
  assert(r_bitmap == r11 || r_bitmap == noreg, "mismatch"); \
  assert(result == rdi || result == noreg, "mismatch"); \
} while(0)
4287
4288 // Versions of salq and rorq that don't need count to be in rcx
4289
4290 void MacroAssembler::salq(Register dest, Register count) {
4291 if (count == rcx) {
4292 Assembler::salq(dest);
4293 } else {
4294 assert_different_registers(rcx, dest);
4295 xchgq(rcx, count);
4296 Assembler::salq(dest);
4297 xchgq(rcx, count);
4298 }
4299 }
4300
4301 void MacroAssembler::rorq(Register dest, Register count) {
4302 if (count == rcx) {
4303 Assembler::rorq(dest);
4304 } else {
4305 assert_different_registers(rcx, dest);
4306 xchgq(rcx, count);
4307 Assembler::rorq(dest);
4308 xchgq(rcx, count);
4309 }
4310 }
4311
// Emits the inline part of a secondary-supers table lookup for a super klass
// whose hash slot is known at code-generation time.
//
// At runtime, return 0 in result if r_super_klass is a superclass of
// r_sub_klass, otherwise return nonzero. Use this if you know the
// super_klass_slot of the class you're looking for. This is always
// the case for instanceof and checkcast.
//
// The register arguments must match the fixed assignment checked by
// LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS, because the slow-path stub called
// below expects its inputs there. temp1..temp4 are overwritten.
void MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
                                                         Register r_super_klass,
                                                         Register temp1,
                                                         Register temp2,
                                                         Register temp3,
                                                         Register temp4,
                                                         Register result,
                                                         u1 super_klass_slot) {
  assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);

  Label L_fallthrough, L_success, L_failure;

  BLOCK_COMMENT("lookup_secondary_supers_table {");

  // Give the temps the names the stub interface uses.
  const Register
    r_array_index = temp1,
    r_array_length = temp2,
    r_array_base = temp3,
    r_bitmap = temp4;

  LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;

  xorq(result, result); // = 0

  movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
  movq(r_array_index, r_bitmap);

  // First check the bitmap to see if super_klass might be present. If
  // the bit is zero, we are certain that super_klass is not one of
  // the secondary supers.
  u1 bit = super_klass_slot;
  {
    // Shift the slot's bit up into the sign position.
    // NB: If the count in a x86 shift instruction is 0, the flags are
    // not affected, so we do a testq instead.
    int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
    if (shift_count != 0) {
      salq(r_array_index, shift_count);
    } else {
      testq(r_array_index, r_array_index);
    }
  }
  // We test the MSB of r_array_index, i.e. its sign bit
  jcc(Assembler::positive, L_failure);

  // Get the first array index that can contain super_klass into r_array_index.
  if (bit != 0) {
    // Count the bits that remain after the shift (the slot's own bit and
    // everything below it in the original bitmap).
    population_count(r_array_index, r_array_index, temp2, temp3);
  } else {
    // Slot 0: only the slot's own bit remains, so the count is known to be 1.
    movl(r_array_index, 1);
  }
  // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.

  // We will consult the secondary-super array.
  movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));

  // We're asserting that the first word in an Array<Klass*> is the
  // length, and the second word is the first word of the data. If
  // that ever changes, r_array_base will have to be adjusted here.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
  assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");

  cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
  jccb(Assembler::equal, L_success);

  // Is there another entry to check? Consult the bitmap.
  // (The next slot's bit index wraps around via SECONDARY_SUPERS_TABLE_MASK.)
  btq(r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
  jccb(Assembler::carryClear, L_failure);

  // Linear probe. Rotate the bitmap so that the next bit to test is
  // in Bit 1.
  if (bit != 0) {
    rorq(r_bitmap, bit);
  }

  // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
  // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
  // Kills: r_array_length.
  // Returns: result.
  call(RuntimeAddress(StubRoutines::lookup_secondary_supers_table_slow_path_stub()));
  // Result (0/1) is in rdi
  jmpb(L_fallthrough);

  bind(L_failure);
  incq(result); // 0 => 1

  // L_success intentionally shares the fall-through: result was zeroed at the
  // top and is still zero when we arrive here via the success branch.
  bind(L_success);
  // result = 0;

  bind(L_fallthrough);
  BLOCK_COMMENT("} lookup_secondary_supers_table");

  if (VerifySecondarySupers) {
    verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
                                  temp1, temp2, temp3);
  }
}
4414
4415 // At runtime, return 0 in result if r_super_klass is a superclass of
4416 // r_sub_klass, otherwise return nonzero. Use this version of
4417 // lookup_secondary_supers_table() if you don't know ahead of time
4418 // which superclass will be searched for. Used by interpreter and
4419 // runtime stubs. It is larger and has somewhat greater latency than
4420 // the version above, which takes a constant super_klass_slot.
//
// Register contract, as enforced by the asserts in the body:
//   - r_sub_klass, r_super_klass, temp1..temp4 and result are pairwise distinct;
//   - neither r_sub_klass nor r_super_klass is rcx;
//   - rcx is one of temp1..temp4.
// The temps may be overwritten by the generated code. On exit, result
// holds 0 if the lookup succeeded and 1 if it failed.
void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
                                                       Register r_super_klass,
                                                       Register temp1,
                                                       Register temp2,
                                                       Register temp3,
                                                       Register temp4,
                                                       Register result) {
  assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
  assert_different_registers(r_sub_klass, r_super_klass, rcx);
  RegSet temps = RegSet::of(temp1, temp2, temp3, temp4);

  Label L_fallthrough, L_success, L_failure;

  BLOCK_COMMENT("lookup_secondary_supers_table {");

  // Working registers are handed out from the temps, with rcx held back
  // because it is reserved for the shift count (see below).
  RegSetIterator<Register> available_regs = (temps - rcx).begin();

  // FIXME. Once we are sure that all paths reaching this point really
  // do pass rcx as one of our temps we can get rid of the following
  // workaround.
  assert(temps.contains(rcx), "fix this code");

  // We prefer to have our shift count in rcx. If rcx is one of our
  // temps, use it for slot. If not, pick any of our temps.
  Register slot;
  if (!temps.contains(rcx)) {
    slot = *available_regs++;
  } else {
    slot = rcx;
  }

  const Register r_array_index = *available_regs++;
  const Register r_bitmap      = *available_regs++;

  // The logic above guarantees this property, but we state it here.
  assert_different_registers(r_array_index, r_bitmap, rcx);

  // Keep one copy of the bitmap intact in r_bitmap (needed for the linear
  // probe further down) and shift a second copy in r_array_index.
  movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
  movq(r_array_index, r_bitmap);

  // First check the bitmap to see if super_klass might be present. If
  // the bit is zero, we are certain that super_klass is not one of
  // the secondary supers.
  movb(slot, Address(r_super_klass, Klass::hash_slot_offset()));
  xorl(slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1)); // slot ^ 63 === 63 - slot (mod 64)
  // After this shift the bit for the hash slot sits in the sign bit.
  salq(r_array_index, slot);

  testq(r_array_index, r_array_index);
  // We test the MSB of r_array_index, i.e. its sign bit
  jcc(Assembler::positive, L_failure);

  // Only allocated now: r_array_base is first used as a scratch register
  // for the population count and is loaded with the array address afterwards.
  const Register r_array_base = *available_regs++;

  // Get the first array index that can contain super_klass into r_array_index.
  // Note: Clobbers r_array_base and slot.
  population_count(r_array_index, r_array_index, /*temp2*/r_array_base, /*temp3*/slot);

  // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.

  // We will consult the secondary-super array.
  movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));

  // We're asserting that the first word in an Array<Klass*> is the
  // length, and the second word is the first word of the data. If
  // that ever changes, r_array_base will have to be adjusted here.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
  assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");

  cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
  jccb(Assembler::equal, L_success);

  // Restore slot to its true value
  movb(slot, Address(r_super_klass, Klass::hash_slot_offset()));

  // Linear probe. Rotate the bitmap so that the next bit to test is
  // in Bit 1.
  rorq(r_bitmap, slot);

  // Is there another entry to check? Consult the bitmap.
  btq(r_bitmap, 1);
  jccb(Assembler::carryClear, L_failure);

  // Emit the slow path right here: lookup_secondary_supers_table_slow_path
  // is invoked as a code generator, so its code is inlined at this point.
  // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
  // Kills: result (used as the array length) and slot (used as scratch).
  // Outcome: branches to L_success on a hit; because a null failure label
  // is passed, a miss falls out of the slow path into L_failure below.
  lookup_secondary_supers_table_slow_path(r_super_klass,
                                          r_array_base,
                                          r_array_index,
                                          r_bitmap,
                                          /*temp1*/result,
                                          /*temp2*/slot,
                                          &L_success,
                                          nullptr);

  bind(L_failure);
  movq(result, 1);
  jmpb(L_fallthrough);

  bind(L_success);
  xorq(result, result); // = 0

  bind(L_fallthrough);
  BLOCK_COMMENT("} lookup_secondary_supers_table");

  // Optional self-check: cross-check the result against a linear scan,
  // reusing three of the temps as scratch registers.
  if (VerifySecondarySupers) {
    verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
                                  temp1, temp2, temp3);
  }
}
4531
4532 void MacroAssembler::repne_scanq(Register addr, Register value, Register count, Register limit,
4533 Label* L_success, Label* L_failure) {
4534 Label L_loop, L_fallthrough;
4535 {
4536 int label_nulls = 0;
4537 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4538 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4539 assert(label_nulls <= 1, "at most one null in the batch");
4540 }
4541 bind(L_loop);
4542 cmpq(value, Address(addr, count, Address::times_8));
4543 jcc(Assembler::equal, *L_success);
4544 addl(count, 1);
4545 cmpl(count, limit);
4546 jcc(Assembler::less, L_loop);
4547
4548 if (&L_fallthrough != L_failure) {
4549 jmp(*L_failure);
4550 }
4551 bind(L_fallthrough);
4552 }
4553
// Called by code generated by check_klass_subtype_slow_path
// above. This is called when there is a collision in the hashed
// lookup in the secondary supers array.
//
// Inputs:
//   r_super_klass - the klass being searched for.
//   r_array_base  - address of the Array<Klass*>; the length is read from
//                   it and it is then advanced to the first data element.
//   r_array_index - index of the slot the caller has already compared.
//   r_bitmap      - the bitmap, already rotated by the caller.
//   temp1, temp2  - scratch; temp1 holds the array length, temp2 is zeroed.
// Outcome: branches to *L_success or *L_failure. At most one of the two may
// be null; a null label means that outcome falls out of the emitted code.
// Clobbers r_array_base, r_array_index, r_bitmap, temp1 and temp2.
void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
                                                             Register r_array_base,
                                                             Register r_array_index,
                                                             Register r_bitmap,
                                                             Register temp1,
                                                             Register temp2,
                                                             Label* L_success,
                                                             Label* L_failure) {
  assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, temp2);

  // Only r_array_length names a real register; r_sub_klass and result are
  // noreg and are not referenced by the code below.
  const Register
    r_array_length = temp1,
    r_sub_klass    = noreg,
    result         = noreg;

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  // Load the array length.
  movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
  // And adjust the array base to point to the data.
  // NB! Effectively increments current slot index by 1.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
  addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());

  // Linear probe
  Label L_huge;

  // The bitmap is full to bursting.
  // Implicit invariant: BITMAP_FULL implies (length > 0)
  cmpl(r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
  jcc(Assembler::greater, L_huge);

  // NB! Our caller has checked bits 0 and 1 in the bitmap. The
  // current slot (at secondary_supers[r_array_index]) has not yet
  // been inspected, and r_array_index may be out of bounds if we
  // wrapped around the end of the array.

  { // This is conventional linear probing, but instead of terminating
    // when a null entry is found in the table, we maintain a bitmap
    // in which a 0 indicates missing entries.
    // The check above guarantees there are 0s in the bitmap, so the loop
    // eventually terminates.

    // temp2 is the constant 0 that the cmov below uses to wrap the index.
    xorl(temp2, temp2); // = 0;

    Label L_again;
    bind(L_again);

    // Check for array wraparound.
    cmpl(r_array_index, r_array_length);
    cmovl(Assembler::greaterEqual, r_array_index, temp2);

    cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
    jcc(Assembler::equal, *L_success);

    // If the next bit in bitmap is zero, we're done.
    btq(r_bitmap, 2); // look-ahead check (Bit 2); Bits 0 and 1 are tested by now
    jcc(Assembler::carryClear, *L_failure);

    // Advance to the next slot: rotate the bitmap and bump the index together.
    rorq(r_bitmap, 1); // Bits 1/2 => 0/1
    addl(r_array_index, 1);

    jmp(L_again);
  }

  { // Degenerate case: more than 64 secondary supers.
    // FIXME: We could do something smarter here, maybe a vectorized
    // comparison or a binary search, but is that worth any added
    // complexity?
    bind(L_huge);
    // Scan the whole array from element 0. If failure is our own
    // fall-through, let repne_scanq fall through as well instead of
    // emitting a jump.
    xorl(r_array_index, r_array_index); // = 0
    repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length,
                L_success,
                (&L_fallthrough != L_failure ? L_failure : nullptr));

    bind(L_fallthrough);
  }
}
4639
// Argument record handed to verify_secondary_supers_table_helper().
//
// verify_secondary_supers_table() builds this record on the stack by pushing
// result, linear_result, r_sub_klass and r_super_klass (in that order) and
// passing rsp as the record address, so the field order here has to mirror
// that push sequence: the value pushed last is the first field.
struct VerifyHelperArguments {
  Klass* _super;            // pushed from r_super_klass
  Klass* _sub;              // pushed from r_sub_klass
  intptr_t _linear_result;  // outcome of the linear scan
  intptr_t _table_result;   // outcome of the hashed table lookup
};
4646
4647 static void verify_secondary_supers_table_helper(const char* msg, VerifyHelperArguments* args) {
4648 Klass::on_secondary_supers_verification_failure(args->_super,
4649 args->_sub,
4650 args->_linear_result,
4651 args->_table_result,
4652 msg);
4653 }
4654
4655 // Make sure that the hashed lookup and a linear scan agree.
4656 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
4657 Register r_super_klass,
4658 Register result,
4659 Register temp1,
4660 Register temp2,
4661 Register temp3) {
4662 const Register
4663 r_array_index = temp1,
4664 r_array_length = temp2,
4665 r_array_base = temp3,
4666 r_bitmap = noreg;
4667
4668 BLOCK_COMMENT("verify_secondary_supers_table {");
4669
4670 Label L_success, L_failure, L_check, L_done;
4671
4672 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4673 movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4674 // And adjust the array base to point to the data.
4675 addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());
4676
4677 testl(r_array_length, r_array_length); // array_length == 0?
4678 jcc(Assembler::zero, L_failure);
4679
4680 movl(r_array_index, 0);
4681 repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length, &L_success);
4682 // fall through to L_failure
4683
4684 const Register linear_result = r_array_index; // reuse temp1
4685
4686 bind(L_failure); // not present
4687 movl(linear_result, 1);
4688 jmp(L_check);
4689
4690 bind(L_success); // present
4691 movl(linear_result, 0);
4692
4693 bind(L_check);
4694 cmpl(linear_result, result);
4695 jcc(Assembler::equal, L_done);
4696
4697 { // To avoid calling convention issues, build a record on the stack
4698 // and pass the pointer to that instead.
4699 push(result);
4700 push(linear_result);
4701 push(r_sub_klass);
4702 push(r_super_klass);
4703 movptr(c_rarg1, rsp);
4704 movptr(c_rarg0, (uintptr_t) "mismatch");
4705 call(RuntimeAddress(CAST_FROM_FN_PTR(address, verify_secondary_supers_table_helper)));
4706 should_not_reach_here();
4707 }
4708 bind(L_done);
4709
4710 BLOCK_COMMENT("} verify_secondary_supers_table");
4711 }
4712
4713 #undef LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS
4714
4715 void MacroAssembler::clinit_barrier(Register klass, Label* L_fast_path, Label* L_slow_path) {
4716 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
4717
4718 Label L_fallthrough;
4719 if (L_fast_path == nullptr) {
4720 L_fast_path = &L_fallthrough;
4721 } else if (L_slow_path == nullptr) {
4722 L_slow_path = &L_fallthrough;
4723 }
4724
4725 // Fast path check: class is fully initialized.
4726 // init_state needs acquire, but x86 is TSO, and so we are already good.
4727 cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4728 jcc(Assembler::equal, *L_fast_path);
4729
4730 // Fast path check: current thread is initializer thread
4731 cmpptr(r15_thread, Address(klass, InstanceKlass::init_thread_offset()));
4732 if (L_slow_path == &L_fallthrough) {
4733 jcc(Assembler::equal, *L_fast_path);
4734 bind(*L_slow_path);
4735 } else if (L_fast_path == &L_fallthrough) {
4736 jcc(Assembler::notEqual, *L_slow_path);
4737 bind(*L_fast_path);
4738 } else {
4739 Unimplemented();
4740 }
4741 }
4742
4743 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4744 if (VM_Version::supports_cmov()) {
4745 cmovl(cc, dst, src);
4746 } else {
4747 Label L;
4748 jccb(negate_condition(cc), L);
4749 movl(dst, src);
4750 bind(L);
4751 }
4752 }
4753
4754 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4755 if (VM_Version::supports_cmov()) {
4756 cmovl(cc, dst, src);
4757 } else {
4758 Label L;
4759 jccb(negate_condition(cc), L);
4760 movl(dst, src);
4761 bind(L);
4762 }
4763 }
4764
4765 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4766 if (!VerifyOops) return;
4767
4768 BLOCK_COMMENT("verify_oop {");
4769 push(rscratch1);
4770 push(rax); // save rax
4771 push(reg); // pass register argument
4772
4773 // Pass register number to verify_oop_subroutine
4774 const char* b = nullptr;
4775 {
4776 ResourceMark rm;
4777 stringStream ss;
4778 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4779 b = code_string(ss.as_string());
4780 }
4781 AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
4782 pushptr(buffer.addr(), rscratch1);
4783
4784 // call indirectly to solve generation ordering problem
4785 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4786 call(rax);
4787 // Caller pops the arguments (oop, message) and restores rax, r10
4788 BLOCK_COMMENT("} verify_oop");
4789 }
4790
4791 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
4792 if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4793 // Only pcmpeq has dependency breaking treatment (i.e the execution can begin without
4794 // waiting for the previous result on dst), not vpcmpeqd, so just use vpternlog
4795 vpternlogd(dst, 0xFF, dst, dst, vector_len);
4796 } else if (VM_Version::supports_avx()) {
4797 vpcmpeqd(dst, dst, dst, vector_len);
4798 } else {
4799 pcmpeqd(dst, dst);
4800 }
4801 }
4802
4803 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4804 int extra_slot_offset) {
4805 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4806 int stackElementSize = Interpreter::stackElementSize;
4807 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4808 #ifdef ASSERT
4809 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4810 assert(offset1 - offset == stackElementSize, "correct arithmetic");
4811 #endif
4812 Register scale_reg = noreg;
4813 Address::ScaleFactor scale_factor = Address::no_scale;
4814 if (arg_slot.is_constant()) {
4815 offset += arg_slot.as_constant() * stackElementSize;
4816 } else {
4817 scale_reg = arg_slot.as_register();
4818 scale_factor = Address::times(stackElementSize);
4819 }
4820 offset += wordSize; // return PC is on stack
4821 return Address(rsp, scale_reg, scale_factor, offset);
4822 }
4823
4824 // Handle the receiver type profile update given the "recv" klass.
4825 //
4826 // Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
4827 // If there are no matching or claimable receiver entries in RD, updates
4828 // the polymorphic counter.
4829 //
4830 // This code expected to run by either the interpreter or JIT-ed code, without
4831 // extra synchronization. For safety, receiver cells are claimed atomically, which
4832 // avoids grossly misrepresenting the profiles under concurrent updates. For speed,
4833 // counter updates are not atomic.
4834 //
4835 void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset) {
4836 int base_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(0));
4837 int end_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit()));
4838 int poly_count_offset = in_bytes(CounterData::count_offset());
4839 int receiver_step = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
4840 int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;
4841
4842 // Adjust for MDP offsets. Slots are pointer-sized, so is the global offset.
4843 assert(is_aligned(mdp_offset, BytesPerWord), "sanity");
4844 base_receiver_offset += mdp_offset;
4845 end_receiver_offset += mdp_offset;
4846 poly_count_offset += mdp_offset;
4847
4848 // Scale down to optimize encoding. Slots are pointer-sized.
4849 assert(is_aligned(base_receiver_offset, BytesPerWord), "sanity");
4850 assert(is_aligned(end_receiver_offset, BytesPerWord), "sanity");
4851 assert(is_aligned(poly_count_offset, BytesPerWord), "sanity");
4852 assert(is_aligned(receiver_step, BytesPerWord), "sanity");
4853 assert(is_aligned(receiver_to_count_step, BytesPerWord), "sanity");
4854 base_receiver_offset >>= LogBytesPerWord;
4855 end_receiver_offset >>= LogBytesPerWord;
4856 poly_count_offset >>= LogBytesPerWord;
4857 receiver_step >>= LogBytesPerWord;
4858 receiver_to_count_step >>= LogBytesPerWord;
4859
4860 #ifdef ASSERT
4861 // We are about to walk the MDO slots without asking for offsets.
4862 // Check that our math hits all the right spots.
4863 for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
4864 int real_recv_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
4865 int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
4866 int offset = base_receiver_offset + receiver_step*c;
4867 int count_offset = offset + receiver_to_count_step;
4868 assert((offset << LogBytesPerWord) == real_recv_offset, "receiver slot math");
4869 assert((count_offset << LogBytesPerWord) == real_count_offset, "receiver count math");
4870 }
4871 int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
4872 assert(poly_count_offset << LogBytesPerWord == real_poly_count_offset, "poly counter math");
4873 #endif
4874
4875 // Corner case: no profile table. Increment poly counter and exit.
4876 if (ReceiverTypeData::row_limit() == 0) {
4877 addptr(Address(mdp, poly_count_offset, Address::times_ptr), DataLayout::counter_increment);
4878 return;
4879 }
4880
4881 Register offset = rscratch1;
4882
4883 Label L_loop_search_receiver, L_loop_search_empty;
4884 Label L_restart, L_found_recv, L_found_empty, L_polymorphic, L_count_update;
4885
4886 // The code here recognizes three major cases:
4887 // A. Fastest: receiver found in the table
4888 // B. Fast: no receiver in the table, and the table is full
4889 // C. Slow: no receiver in the table, free slots in the table
4890 //
4891 // The case A performance is most important, as perfectly-behaved code would end up
4892 // there, especially with larger TypeProfileWidth. The case B performance is
4893 // important as well, this is where bulk of code would land for normally megamorphic
4894 // cases. The case C performance is not essential, its job is to deal with installation
4895 // races, we optimize for code density instead. Case C needs to make sure that receiver
4896 // rows are only claimed once. This makes sure we never overwrite a row for another
4897 // receiver and never duplicate the receivers in the list, making profile type-accurate.
4898 //
4899 // It is very tempting to handle these cases in a single loop, and claim the first slot
4900 // without checking the rest of the table. But, profiling code should tolerate free slots
4901 // in the table, as class unloading can clear them. After such cleanup, the receiver
4902 // we need might be _after_ the free slot. Therefore, we need to let at least full scan
4903 // to complete, before trying to install new slots. Splitting the code in several tight
4904 // loops also helpfully optimizes for cases A and B.
4905 //
4906 // This code is effectively:
4907 //
4908 // restart:
4909 // // Fastest: receiver is already installed
4910 // for (i = 0; i < receiver_count(); i++) {
4911 // if (receiver(i) == recv) goto found_recv(i);
4912 // }
4913 //
4914 // // Fast: no receiver, but profile is full
4915 // for (i = 0; i < receiver_count(); i++) {
4916 // if (receiver(i) == null) goto found_null(i);
4917 // }
4918 // goto polymorphic
4919 //
4920 // // Slow: try to install receiver
4921 // found_null(i):
4922 // CAS(&receiver(i), null, recv);
4923 // goto restart
4924 //
4925 // polymorphic:
4926 // count++;
4927 // return
4928 //
4929 // found_recv(i):
4930 // *receiver_count(i)++
4931 //
4932
4933 bind(L_restart);
4934
4935 // Fastest: receiver is already installed
4936 movptr(offset, base_receiver_offset);
4937 bind(L_loop_search_receiver);
4938 cmpptr(recv, Address(mdp, offset, Address::times_ptr));
4939 jccb(Assembler::equal, L_found_recv);
4940 addptr(offset, receiver_step);
4941 cmpptr(offset, end_receiver_offset);
4942 jccb(Assembler::notEqual, L_loop_search_receiver);
4943
4944 // Fast: no receiver, but profile is full
4945 movptr(offset, base_receiver_offset);
4946 bind(L_loop_search_empty);
4947 cmpptr(Address(mdp, offset, Address::times_ptr), NULL_WORD);
4948 jccb(Assembler::equal, L_found_empty);
4949 addptr(offset, receiver_step);
4950 cmpptr(offset, end_receiver_offset);
4951 jccb(Assembler::notEqual, L_loop_search_empty);
4952 jmpb(L_polymorphic);
4953
4954 // Slow: try to install receiver
4955 bind(L_found_empty);
4956
4957 // Atomically swing receiver slot: null -> recv.
4958 //
4959 // The update code uses CAS, which wants RAX register specifically, *and* it needs
4960 // other important registers untouched, as they form the address. Therefore, we need
4961 // to shift any important registers from RAX into some other spare register. If we
4962 // have a spare register, we are forced to save it on stack here.
4963
4964 Register spare_reg = noreg;
4965 Register shifted_mdp = mdp;
4966 Register shifted_recv = recv;
4967 if (recv == rax || mdp == rax) {
4968 spare_reg = (recv != rbx && mdp != rbx) ? rbx :
4969 (recv != rcx && mdp != rcx) ? rcx :
4970 rdx;
4971 assert_different_registers(mdp, recv, offset, spare_reg);
4972
4973 push(spare_reg);
4974 if (recv == rax) {
4975 movptr(spare_reg, recv);
4976 shifted_recv = spare_reg;
4977 } else {
4978 assert(mdp == rax, "Remaining case");
4979 movptr(spare_reg, mdp);
4980 shifted_mdp = spare_reg;
4981 }
4982 } else {
4983 push(rax);
4984 }
4985
4986 // None of the important registers are in RAX after this shuffle.
4987 assert_different_registers(rax, shifted_mdp, shifted_recv, offset);
4988
4989 xorptr(rax, rax);
4990 cmpxchgptr(shifted_recv, Address(shifted_mdp, offset, Address::times_ptr));
4991
4992 // Unshift registers.
4993 if (recv == rax || mdp == rax) {
4994 movptr(rax, spare_reg);
4995 pop(spare_reg);
4996 } else {
4997 pop(rax);
4998 }
4999
5000 // CAS success means the slot now has the receiver we want. CAS failure means
5001 // something had claimed the slot concurrently: it can be the same receiver we want,
5002 // or something else. Since this is a slow path, we can optimize for code density,
5003 // and just restart the search from the beginning.
5004 jmpb(L_restart);
5005
5006 // Counter updates:
5007
5008 // Increment polymorphic counter instead of receiver slot.
5009 bind(L_polymorphic);
5010 movptr(offset, poly_count_offset);
5011 jmpb(L_count_update);
5012
5013 // Found a receiver, convert its slot offset to corresponding count offset.
5014 bind(L_found_recv);
5015 addptr(offset, receiver_to_count_step);
5016
5017 bind(L_count_update);
5018 addptr(Address(mdp, offset, Address::times_ptr), DataLayout::counter_increment);
5019 }
5020
5021 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
5022 if (!VerifyOops) return;
5023
5024 push(rscratch1);
5025 push(rax); // save rax,
5026 // addr may contain rsp so we will have to adjust it based on the push
5027 // we just did (and on 64 bit we do two pushes)
5028 // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
5029 // stores rax into addr which is backwards of what was intended.
5030 if (addr.uses(rsp)) {
5031 lea(rax, addr);
5032 pushptr(Address(rax, 2 * BytesPerWord));
5033 } else {
5034 pushptr(addr);
5035 }
5036
5037 // Pass register number to verify_oop_subroutine
5038 const char* b = nullptr;
5039 {
5040 ResourceMark rm;
5041 stringStream ss;
5042 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
5043 b = code_string(ss.as_string());
5044 }
5045 AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
5046 pushptr(buffer.addr(), rscratch1);
5047
5048 // call indirectly to solve generation ordering problem
5049 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
5050 call(rax);
5051 // Caller pops the arguments (addr, message) and restores rax, r10.
5052 }
5053
5054 void MacroAssembler::verify_tlab() {
5055 #ifdef ASSERT
5056 if (UseTLAB && VerifyOops) {
5057 Label next, ok;
5058 Register t1 = rsi;
5059
5060 push(t1);
5061
5062 movptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
5063 cmpptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_start_offset())));
5064 jcc(Assembler::aboveEqual, next);
5065 STOP("assert(top >= start)");
5066 should_not_reach_here();
5067
5068 bind(next);
5069 movptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_end_offset())));
5070 cmpptr(t1, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
5071 jcc(Assembler::aboveEqual, ok);
5072 STOP("assert(top <= end)");
5073 should_not_reach_here();
5074
5075 bind(ok);
5076 pop(t1);
5077 }
5078 #endif
5079 }
5080
5081 class ControlWord {
5082 public:
5083 int32_t _value;
5084
5085 int rounding_control() const { return (_value >> 10) & 3 ; }
5086 int precision_control() const { return (_value >> 8) & 3 ; }
5087 bool precision() const { return ((_value >> 5) & 1) != 0; }
5088 bool underflow() const { return ((_value >> 4) & 1) != 0; }
5089 bool overflow() const { return ((_value >> 3) & 1) != 0; }
5090 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
5091 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
5092 bool invalid() const { return ((_value >> 0) & 1) != 0; }
5093
5094 void print() const {
5095 // rounding control
5096 const char* rc;
5097 switch (rounding_control()) {
5098 case 0: rc = "round near"; break;
5099 case 1: rc = "round down"; break;
5100 case 2: rc = "round up "; break;
5101 case 3: rc = "chop "; break;
5102 default:
5103 rc = nullptr; // silence compiler warnings
5104 fatal("Unknown rounding control: %d", rounding_control());
5105 };
5106 // precision control
5107 const char* pc;
5108 switch (precision_control()) {
5109 case 0: pc = "24 bits "; break;
5110 case 1: pc = "reserved"; break;
5111 case 2: pc = "53 bits "; break;
5112 case 3: pc = "64 bits "; break;
5113 default:
5114 pc = nullptr; // silence compiler warnings
5115 fatal("Unknown precision control: %d", precision_control());
5116 };
5117 // flags
5118 char f[9];
5119 f[0] = ' ';
5120 f[1] = ' ';
5121 f[2] = (precision ()) ? 'P' : 'p';
5122 f[3] = (underflow ()) ? 'U' : 'u';
5123 f[4] = (overflow ()) ? 'O' : 'o';
5124 f[5] = (zero_divide ()) ? 'Z' : 'z';
5125 f[6] = (denormalized()) ? 'D' : 'd';
5126 f[7] = (invalid ()) ? 'I' : 'i';
5127 f[8] = '\x0';
5128 // output
5129 printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
5130 }
5131
5132 };
5133
// Bit-field view of the status word stored in FPU_State.
// Only the low 16 bits of _value are printed.
class StatusWord {
 public:
  int32_t _value;

  bool busy() const                    { return bit_at(15); }
  bool C3() const                      { return bit_at(14); }
  bool C2() const                      { return bit_at(10); }
  bool C1() const                      { return bit_at(9); }
  bool C0() const                      { return bit_at(8); }
  // Three-bit field at bits 11..13.
  int  top() const                     { return (_value >> 11) & 7; }
  bool error_status() const            { return bit_at(7); }
  bool stack_fault() const             { return bit_at(6); }
  bool precision() const               { return bit_at(5); }
  bool underflow() const               { return bit_at(4); }
  bool overflow() const                { return bit_at(3); }
  bool zero_divide() const             { return bit_at(2); }
  bool denormalized() const            { return bit_at(1); }
  bool invalid() const                 { return bit_at(0); }

  // Prints the raw value, one letter per set flag ('-' when clear), the
  // condition codes and the top-of-stack index.
  void print() const {
    // condition codes
    const bool cc_set[4] = { C3(), C2(), C1(), C0() };
    const char cc_name[] = "3210";
    char c[5];
    for (int i = 0; i < 4; i++) {
      c[i] = cc_set[i] ? cc_name[i] : '-';
    }
    c[4] = '\x0';

    // flags
    const bool flag_set[8] = { error_status(), stack_fault(), precision(), underflow(),
                               overflow(), zero_divide(), denormalized(), invalid() };
    const char flag_name[] = "ESPUOZDI";
    char f[9];
    for (int i = 0; i < 8; i++) {
      f[i] = flag_set[i] ? flag_name[i] : '-';
    }
    f[8] = '\x0';

    // output
    printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, f, c, top());
  }

 private:
  bool bit_at(int pos) const           { return ((_value >> pos) & 1) != 0; }
};
5177
// View of the tag word stored in FPU_State: a sequence of two-bit tags,
// with tag i occupying bits 2*i and 2*i+1.
class TagWord {
 public:
  int32_t _value;

  // Returns the two-bit tag number i.
  int tag_at(int i) const {
    const int shift = i * 2;
    return (_value >> shift) & 3;
  }

  // Prints the low 16 bits of the raw value in hex.
  void print() const {
    const int low16 = _value & 0xFFFF;
    printf("%04x", low16);
  }
};
5189
// View of one register image inside FPU_State: two 32-bit words (_m0, _m1)
// followed by a 16-bit word (_ex).
class FPU_Register {
 public:
  int32_t _m0;
  int32_t _m1;
  int16_t _ex;

  // True for the one bit pattern _ex == -1, _m1 == 0xC0000000, _m0 == 0.
  bool is_indefinite() const {
    if (_ex != -1) {
      return false;
    }
    if (_m1 != (int32_t)0xC0000000) {
      return false;
    }
    return _m0 == 0;
  }

  // Prints a sign character derived from _ex, the three raw words in hex,
  // and a "NaN" marker when _ex is 0x7FFF or -1.
  void print() const {
    const char sign = (_ex < 0) ? '-' : '+';
    const bool nan_like = (_ex == 0x7FFF || _ex == (int16_t)-1);
    const char* kind = nan_like ? "NaN" : " ";
    printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, kind);
  }
};
5207
5208 class FPU_State {
5209 public:
5210 enum {
5211 register_size = 10,
5212 number_of_registers = 8,
5213 register_mask = 7
5214 };
5215
5216 ControlWord _control_word;
5217 StatusWord _status_word;
5218 TagWord _tag_word;
5219 int32_t _error_offset;
5220 int32_t _error_selector;
5221 int32_t _data_offset;
5222 int32_t _data_selector;
5223 int8_t _register[register_size * number_of_registers];
5224
5225 int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
5226 FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; }
5227
5228 const char* tag_as_string(int tag) const {
5229 switch (tag) {
5230 case 0: return "valid";
5231 case 1: return "zero";
5232 case 2: return "special";
5233 case 3: return "empty";
5234 }
5235 ShouldNotReachHere();
5236 return nullptr;
5237 }
5238
5239 void print() const {
5240 // print computation registers
5241 { int t = _status_word.top();
5242 for (int i = 0; i < number_of_registers; i++) {
5243 int j = (i - t) & register_mask;
5244 printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
5245 st(j)->print();
5246 printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
5247 }
5248 }
5249 printf("\n");
5250 // print control registers
5251 printf("ctrl = "); _control_word.print(); printf("\n");
5252 printf("stat = "); _status_word .print(); printf("\n");
5253 printf("tags = "); _tag_word .print(); printf("\n");
5254 }
5255
5256 };
5257
// Bit-field view of the saved flags word in IU_State.
class Flag_Register {
 public:
  int32_t _value;

  bool overflow() const                { return bit_at(11); }
  bool direction() const               { return bit_at(10); }
  bool sign() const                    { return bit_at(7); }
  bool zero() const                    { return bit_at(6); }
  bool auxiliary_carry() const         { return bit_at(4); }
  bool parity() const                  { return bit_at(2); }
  bool carry() const                   { return bit_at(0); }

  // Prints the raw value in hex followed by one letter per set flag
  // ('-' when the flag is clear).
  void print() const {
    // flags
    const bool flag_set[7] = { overflow(), direction(), sign(), zero(),
                               auxiliary_carry(), parity(), carry() };
    const char flag_name[] = "ODSZAPC";
    char f[8];
    for (int i = 0; i < 7; i++) {
      f[i] = flag_set[i] ? flag_name[i] : '-';
    }
    f[7] = '\x0';

    // output
    printf("%08x flags = %s", _value, f);
  }

 private:
  bool bit_at(int pos) const           { return ((_value >> pos) & 1) != 0; }
};
5286
// One saved integer register, held as a 32-bit value.
class IU_Register {
 public:
  int32_t _value;

  // Prints the value twice: as 8 hex digits and as a right-aligned decimal.
  void print() const {
    const int32_t v = _value;
    printf("%08x %11d", v, v);
  }
};
5296
5297 class IU_State {
5298 public:
5299 Flag_Register _eflags;
5300 IU_Register _rdi;
5301 IU_Register _rsi;
5302 IU_Register _rbp;
5303 IU_Register _rsp;
5304 IU_Register _rbx;
5305 IU_Register _rdx;
5306 IU_Register _rcx;
5307 IU_Register _rax;
5308
5309 void print() const {
5310 // computation registers
5311 printf("rax, = "); _rax.print(); printf("\n");
5312 printf("rbx, = "); _rbx.print(); printf("\n");
5313 printf("rcx = "); _rcx.print(); printf("\n");
5314 printf("rdx = "); _rdx.print(); printf("\n");
5315 printf("rdi = "); _rdi.print(); printf("\n");
5316 printf("rsi = "); _rsi.print(); printf("\n");
5317 printf("rbp, = "); _rbp.print(); printf("\n");
5318 printf("rsp = "); _rsp.print(); printf("\n");
5319 printf("\n");
5320 // control registers
5321 printf("flgs = "); _eflags.print(); printf("\n");
5322 }
5323 };
5324
5325
5326 class CPU_State {
5327 public:
5328 FPU_State _fpu_state;
5329 IU_State _iu_state;
5330
5331 void print() const {
5332 printf("--------------------------------------------------\n");
5333 _iu_state .print();
5334 printf("\n");
5335 _fpu_state.print();
5336 printf("--------------------------------------------------\n");
5337 }
5338
5339 };
5340
5341
5342 static void _print_CPU_state(CPU_State* state) {
5343 state->print();
5344 };
5345
5346
5347 void MacroAssembler::print_CPU_state() {
5348 push_CPU_state();
5349 push(rsp); // pass CPU state
5350 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5351 addptr(rsp, wordSize); // discard argument
5352 pop_CPU_state();
5353 }
5354
// Emits the CPU control-state fix-up that follows a JNI call.
//   rscratch - scratch register handed to ldmxcsr for addressing the
//              standard MXCSR value.
void MacroAssembler::restore_cpu_control_state_after_jni(Register rscratch) {
  // Either restore the MXCSR register after returning from the JNI Call
  // or verify that it wasn't changed (with -Xcheck:jni flag).
  // Restoring takes precedence: the verification stub is only called when
  // RestoreMXCSROnJNICalls is off.
  if (RestoreMXCSROnJNICalls) {
    ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), rscratch);
  } else if (CheckJNICalls) {
    call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
  }
  // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
  vzeroupper();
}
5366
5367 // ((OopHandle)result).resolve();
5368 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5369 assert_different_registers(result, tmp);
5370
5371 // Only 64 bit platforms support GCs that require a tmp register
5372 // Only IN_HEAP loads require a thread_tmp register
5373 // OopHandle::resolve is an indirection like jobject.
5374 access_load_at(T_OBJECT, IN_NATIVE,
5375 result, Address(result, 0), tmp);
5376 }
5377
// ((WeakHandle)result).resolve();
// 'rresult' holds the weak handle on entry and the resolved value on exit;
// 'rtmp' is a scratch register for the barrier code and must differ from
// 'rresult'.
void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
  assert_different_registers(rresult, rtmp);
  Label resolved;

  // A null weak handle resolves to null.
  cmpptr(rresult, 0);
  jcc(Assembler::equal, resolved);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // WeakHandle::resolve is an indirection like jweak.
  // The load is decorated as a phantom-strength, off-heap access.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                 rresult, Address(rresult, 0), rtmp);
  bind(resolved);
}
5394
5395 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
5396 // get mirror
5397 const int mirror_offset = in_bytes(Klass::java_mirror_offset());
5398 load_method_holder(mirror, method);
5399 movptr(mirror, Address(mirror, mirror_offset));
5400 resolve_oop_handle(mirror, tmp);
5401 }
5402
// Loads into 'rresult' the class-loader-data field of the holder class of
// the method in 'rmethod'.
void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
  load_method_holder(rresult, rmethod);
  movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
}
5407
// Loads the holder class of 'method' into 'holder' by following
// Method -> ConstMethod -> ConstantPool -> pool holder.
// 'holder' is reused as the intermediate pointer at every step.
void MacroAssembler::load_method_holder(Register holder, Register method) {
  movptr(holder, Address(method, Method::const_offset())); // ConstMethod*
  movptr(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool*
  movptr(holder, Address(holder, ConstantPool::pool_holder_offset())); // InstanceKlass*
}
5413
// Loads the narrow klass of the object in 'src' into 'dst' when compact
// object headers are in use: reads the 64-bit mark word and shifts it right
// by markWord::klass_shift.
void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
  assert(UseCompactObjectHeaders, "expect compact object headers");
  movq(dst, Address(src, oopDesc::mark_offset_in_bytes()));
  shrq(dst, markWord::klass_shift);
}
5419
5420 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
5421 assert_different_registers(src, tmp);
5422 assert_different_registers(dst, tmp);
5423
5424 if (UseCompactObjectHeaders) {
5425 load_narrow_klass_compact(dst, src);
5426 decode_klass_not_null(dst, tmp);
5427 } else {
5428 movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5429 decode_klass_not_null(dst, tmp);
5430 }
5431 }
5432
// Stores the klass held in 'src' into the klass field of the object at
// 'dst'. Not usable with compact object headers.
// Note: 'src' is encoded in place by encode_klass_not_null, so it no longer
// holds the original pointer afterwards; 'tmp' is scratch for that encoding.
void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
  assert(!UseCompactObjectHeaders, "not with compact headers");
  assert_different_registers(src, tmp);
  assert_different_registers(dst, tmp);
  encode_klass_not_null(src, tmp);
  // The encoded klass is written as a 32-bit value.
  movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}
5440
5441 void MacroAssembler::cmp_klass(Register klass, Register obj, Register tmp) {
5442 if (UseCompactObjectHeaders) {
5443 assert(tmp != noreg, "need tmp");
5444 assert_different_registers(klass, obj, tmp);
5445 load_narrow_klass_compact(tmp, obj);
5446 cmpl(klass, tmp);
5447 } else {
5448 cmpl(klass, Address(obj, oopDesc::klass_offset_in_bytes()));
5449 }
5450 }
5451
5452 void MacroAssembler::cmp_klasses_from_objects(Register obj1, Register obj2, Register tmp1, Register tmp2) {
5453 if (UseCompactObjectHeaders) {
5454 assert(tmp2 != noreg, "need tmp2");
5455 assert_different_registers(obj1, obj2, tmp1, tmp2);
5456 load_narrow_klass_compact(tmp1, obj1);
5457 load_narrow_klass_compact(tmp2, obj2);
5458 cmpl(tmp1, tmp2);
5459 } else {
5460 movl(tmp1, Address(obj1, oopDesc::klass_offset_in_bytes()));
5461 cmpl(tmp1, Address(obj2, oopDesc::klass_offset_in_bytes()));
5462 }
5463 }
5464
5465 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5466 Register tmp1) {
5467 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5468 decorators = AccessInternal::decorator_fixup(decorators, type);
5469 bool as_raw = (decorators & AS_RAW) != 0;
5470 if (as_raw) {
5471 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1);
5472 } else {
5473 bs->load_at(this, decorators, type, dst, src, tmp1);
5474 }
5475 }
5476
5477 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
5478 Register tmp1, Register tmp2, Register tmp3) {
5479 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5480 decorators = AccessInternal::decorator_fixup(decorators, type);
5481 bool as_raw = (decorators & AS_RAW) != 0;
5482 if (as_raw) {
5483 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
5484 } else {
5485 bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
5486 }
5487 }
5488
5489 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, DecoratorSet decorators) {
5490 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1);
5491 }
5492
5493 // Doesn't do verification, generates fixed size code
5494 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, DecoratorSet decorators) {
5495 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1);
5496 }
5497
5498 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
5499 Register tmp2, Register tmp3, DecoratorSet decorators) {
5500 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
5501 }
5502
// Used for storing nulls.
// Issues an IN_HEAP oop store to 'dst' with noreg as the value register and
// no scratch registers.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
}
5507
// Writes the 32-bit value in 'src' to the klass gap of the object at 'dst'.
// Not usable with compact object headers.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  assert(!UseCompactObjectHeaders, "Don't use with compact headers");
  // Store to klass gap in destination
  movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
}
5513
#ifdef ASSERT
// Debug-only check (emitted when CheckCompressedOops is set) that
// r12_heapbase equals the value stored at CompressedOops::base_addr().
// On mismatch the generated code stops with 'msg'.
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    ExternalAddress src2(CompressedOops::base_addr());
    const bool is_src2_reachable = reachable(src2);
    // When the address is not directly reachable the compare needs
    // rscratch1, so its value is preserved around the check.
    if (!is_src2_reachable) {
      push(rscratch1); // cmpptr trashes rscratch1
    }
    cmpptr(r12_heapbase, src2, rscratch1);
    jcc(Assembler::equal, ok);
    STOP(msg);
    bind(ok);
    if (!is_src2_reachable) {
      pop(rscratch1);
    }
  }
}
#endif
5535
// Algorithm must match oop.inline.hpp encode_heap_oop.
// Compresses the (possibly null) oop in 'r' in place.
void MacroAssembler::encode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop");
  // Zero-based heap: only the shift (if any) has to be applied.
  if (CompressedOops::base() == nullptr) {
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      shrq(r, LogMinObjAlignmentInBytes);
    }
    return;
  }
  // Non-zero base: a null oop is first replaced by the heap base so that the
  // subtraction below yields 0 for it.
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}
5554
// Compresses the oop in 'r' in place; the oop must not be null (checked in
// debug builds when CheckCompressedOops is set). No null handling is
// emitted: the base is subtracted if there is one, then the shift is
// applied if there is one.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
  if (CompressedOops::base() != nullptr) {
    subq(r, r12_heapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}
5575
// Two-register form: compresses the non-null oop in 'src' into 'dst'.
// 'src' is left untouched unless it is the same register as 'dst'.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
  // Copy first, then encode the copy in place.
  if (dst != src) {
    movq(dst, src);
  }
  if (CompressedOops::base() != nullptr) {
    subq(dst, r12_heapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}
5599
// Decompresses the (possibly null) narrow oop in 'r' in place.
void MacroAssembler::decode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (CompressedOops::base() == nullptr) {
    // Zero-based heap: only the shift (if any) has to be applied.
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      shlq(r, LogMinObjAlignmentInBytes);
    }
  } else {
    Label done;
    shlq(r, LogMinObjAlignmentInBytes);
    // The branch consumes the flags left by the shift: a zero result (null
    // narrow oop) skips the base addition, so null stays null.
    jccb(Assembler::equal, done);
    addq(r, r12_heapbase);
    bind(done);
  }
  verify_oop_msg(r, "broken oop in decode_heap_oop");
}
5618
// Decompresses the non-null narrow oop in 'r' in place. No null check is
// emitted: the value is shifted and, if there is a heap base, the base is
// added. Nothing is emitted when the shift is zero.
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shlq(r, LogMinObjAlignmentInBytes);
    if (CompressedOops::base() != nullptr) {
      addq(r, r12_heapbase);
    }
  } else {
    // A zero shift is only expected together with a null base.
    assert (CompressedOops::base() == nullptr, "sanity");
  }
}
5636
// Two-register form: decompresses the non-null narrow oop in 'src' into
// 'dst'.
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      // Shift-by-3: a single lea computes r12_heapbase + src * 8.
      // NOTE(review): r12_heapbase is used here without checking the base;
      // presumably it holds zero when the base is null (reinit_heapbase
      // clears it in that case) - confirm.
      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    } else {
      // Other shifts: copy, shift, then add the base if there is one.
      if (dst != src) {
        movq(dst, src);
      }
      shlq(dst, LogMinObjAlignmentInBytes);
      if (CompressedOops::base() != nullptr) {
        addq(dst, r12_heapbase);
      }
    }
  } else {
    // No shift (and therefore no base expected): a plain register move.
    assert (CompressedOops::base() == nullptr, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
  }
}
5664
// Encodes the Klass pointer in 'r' in place: subtracts the encoding base
// (if any) and shifts right by the encoding shift (if any).
// 'tmp' receives the base and must differ from 'r'.
void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
  BLOCK_COMMENT("encode_klass_not_null {");
  assert_different_registers(r, tmp);
  if (CompressedKlassPointers::base() != nullptr) {
    // When AOT code is being dumped the base is loaded from its address at
    // run time instead of being embedded as an immediate.
    if (AOTCodeCache::is_on_for_dump()) {
      movptr(tmp, ExternalAddress(CompressedKlassPointers::base_addr()));
    } else {
      movptr(tmp, (intptr_t)CompressedKlassPointers::base());
    }
    subq(r, tmp);
  }
  if (CompressedKlassPointers::shift() != 0) {
    shrq(r, CompressedKlassPointers::shift());
  }
  BLOCK_COMMENT("} encode_klass_not_null");
}
5681
// Encodes the Klass pointer in 'src' into 'dst' without a scratch register;
// 'src' and 'dst' must be different registers and 'src' is not modified.
void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
  BLOCK_COMMENT("encode_and_move_klass_not_null {");
  assert_different_registers(src, dst);
  if (CompressedKlassPointers::base() != nullptr) {
    // dst = -base, then dst += src, i.e. dst = src - base.
    if (AOTCodeCache::is_on_for_dump()) {
      // Base is loaded from its address at run time and negated.
      movptr(dst, ExternalAddress(CompressedKlassPointers::base_addr()));
      negq(dst);
    } else {
      movptr(dst, -(intptr_t)CompressedKlassPointers::base());
    }
    addq(dst, src);
  } else {
    movptr(dst, src);
  }
  if (CompressedKlassPointers::shift() != 0) {
    shrq(dst, CompressedKlassPointers::shift());
  }
  BLOCK_COMMENT("} encode_and_move_klass_not_null");
}
5701
// Decodes the narrow klass in 'r' in place: shifts left by the encoding
// shift (if any) and adds the encoding base (if any).
// 'tmp' receives the base and must differ from 'r'.
void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
  BLOCK_COMMENT("decode_klass_not_null {");
  assert_different_registers(r, tmp);
  // Note: it will change flags
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedKlassPointers::shift() != 0) {
    shlq(r, CompressedKlassPointers::shift());
  }
  if (CompressedKlassPointers::base() != nullptr) {
    // When AOT code is being dumped the base is loaded from its address at
    // run time instead of being embedded as an immediate.
    if (AOTCodeCache::is_on_for_dump()) {
      movptr(tmp, ExternalAddress(CompressedKlassPointers::base_addr()));
    } else {
      movptr(tmp, (intptr_t)CompressedKlassPointers::base());
    }
    addq(r, tmp);
  }
  BLOCK_COMMENT("} decode_klass_not_null");
}
5722
// Decodes the narrow klass in 'src' into 'dst' without a scratch register;
// 'src' and 'dst' must be different registers and 'src' is not modified.
// Three code shapes are generated depending on the encoding base and shift.
void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
  BLOCK_COMMENT("decode_and_move_klass_not_null {");
  assert_different_registers(src, dst);
  // Note: it will change flags
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.

  if (CompressedKlassPointers::base() == nullptr &&
      CompressedKlassPointers::shift() == 0) {
    // The best case scenario is that there is no base or shift. Then it is already
    // a pointer that needs nothing but a register rename.
    movl(dst, src);
  } else {
    if (CompressedKlassPointers::shift() <= Address::times_8) {
      // Small shift: put the base (or zero) into dst, then add the narrow
      // klass, scaled via lea when a shift is present.
      if (CompressedKlassPointers::base() != nullptr) {
        if (AOTCodeCache::is_on_for_dump()) {
          movptr(dst, ExternalAddress(CompressedKlassPointers::base_addr()));
        } else {
          movptr(dst, (intptr_t)CompressedKlassPointers::base());
        }
      } else {
        xorq(dst, dst);
      }
      if (CompressedKlassPointers::shift() != 0) {
        // Only a shift of exactly 3 is supported on this path.
        assert(CompressedKlassPointers::shift() == Address::times_8, "klass not aligned on 64bits?");
        leaq(dst, Address(dst, src, Address::times_8, 0));
      } else {
        addq(dst, src);
      }
    } else {
      // Larger shift: compute ((base >> shift) + src) << shift.
      if (CompressedKlassPointers::base() != nullptr) {
        if (AOTCodeCache::is_on_for_dump()) {
          // Base is loaded at run time and pre-shifted in the generated code.
          movptr(dst, ExternalAddress(CompressedKlassPointers::base_addr()));
          shrq(dst, CompressedKlassPointers::shift());
        } else {
          // Base is pre-shifted at code generation time.
          const intptr_t base_right_shifted =
              (intptr_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
          movptr(dst, base_right_shifted);
        }
      } else {
        xorq(dst, dst);
      }
      addq(dst, src);
      shlq(dst, CompressedKlassPointers::shift());
    }
  }
  BLOCK_COMMENT("} decode_and_move_klass_not_null");
}
5772
5773 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5774 assert (UseCompressedOops, "should only be used for compressed headers");
5775 assert (Universe::heap() != nullptr, "java heap should be initialized");
5776 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5777 int oop_index = oop_recorder()->find_index(obj);
5778 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5779 mov_narrow_oop(dst, oop_index, rspec);
5780 }
5781
5782 void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5783 assert (UseCompressedOops, "should only be used for compressed headers");
5784 assert (Universe::heap() != nullptr, "java heap should be initialized");
5785 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5786 int oop_index = oop_recorder()->find_index(obj);
5787 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5788 mov_narrow_oop(dst, oop_index, rspec);
5789 }
5790
5791 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5792 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5793 int klass_index = oop_recorder()->find_index(k);
5794 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5795 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5796 }
5797
5798 void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5799 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5800 int klass_index = oop_recorder()->find_index(k);
5801 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5802 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5803 }
5804
5805 void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5806 assert (UseCompressedOops, "should only be used for compressed headers");
5807 assert (Universe::heap() != nullptr, "java heap should be initialized");
5808 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5809 int oop_index = oop_recorder()->find_index(obj);
5810 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5811 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5812 }
5813
5814 void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5815 assert (UseCompressedOops, "should only be used for compressed headers");
5816 assert (Universe::heap() != nullptr, "java heap should be initialized");
5817 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5818 int oop_index = oop_recorder()->find_index(obj);
5819 RelocationHolder rspec = oop_Relocation::spec(oop_index);
5820 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5821 }
5822
5823 void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5824 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5825 int klass_index = oop_recorder()->find_index(k);
5826 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5827 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5828 }
5829
5830 void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5831 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5832 int klass_index = oop_recorder()->find_index(k);
5833 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5834 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5835 }
5836
5837 void MacroAssembler::reinit_heapbase() {
5838 if (UseCompressedOops) {
5839 if (Universe::heap() != nullptr && !AOTCodeCache::is_on_for_dump()) {
5840 if (CompressedOops::base() == nullptr) {
5841 MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5842 } else {
5843 mov64(r12_heapbase, (int64_t)CompressedOops::base());
5844 }
5845 } else {
5846 movptr(r12_heapbase, ExternalAddress(CompressedOops::base_addr()));
5847 }
5848 }
5849 }
5850
5851 #if COMPILER2_OR_JVMCI
5852
// clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
//   rtmp - scratch register for the masked tail fills
//   xtmp - vector register that is zeroed and used as the store source
//   mask - opmask register for the masked tail fills
// Both 'base' and 'cnt' are modified by the generated code.
void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
  // cnt - number of qwords (8-byte words).
  // base - start address, qword aligned.
  Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
  bool use64byteVector = (MaxVectorSize == 64) && (CopyAVX3Threshold == 0);
  // Zero the widest vector register that will be used for the stores.
  if (use64byteVector) {
    vpxor(xtmp, xtmp, xtmp, AVX_512bit);
  } else if (MaxVectorSize >= 32) {
    vpxor(xtmp, xtmp, xtmp, AVX_256bit);
  } else {
    pxor(xtmp, xtmp);
  }
  // Enter the main loop at its condition check.
  jmp(L_zero_64_bytes);

  // Main loop: clear 64 bytes (8 qwords) per iteration.
  BIND(L_loop);
  if (MaxVectorSize >= 32) {
    fill64(base, 0, xtmp, use64byteVector);
  } else {
    movdqu(Address(base, 0), xtmp);
    movdqu(Address(base, 16), xtmp);
    movdqu(Address(base, 32), xtmp);
    movdqu(Address(base, 48), xtmp);
  }
  addptr(base, 64);

  BIND(L_zero_64_bytes);
  subptr(cnt, 8);
  jccb(Assembler::greaterEqual, L_loop);

  // Clear the trailing part (fewer than 8 qwords remain; cnt is negative here)
  if (use64byteVector) {
    // Restore the remaining qword count and clear it with one masked store.
    addptr(cnt, 8);
    jccb(Assembler::equal, L_end);
    fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
    jmp(L_end);
  } else {
    // If at least 4 qwords remain, clear 32 bytes first.
    addptr(cnt, 4);
    jccb(Assembler::less, L_tail);
    if (MaxVectorSize >= 32) {
      vmovdqu(Address(base, 0), xtmp);
    } else {
      movdqu(Address(base, 0), xtmp);
      movdqu(Address(base, 16), xtmp);
    }
  }
  addptr(base, 32);
  subptr(cnt, 4);

  // Fewer than 4 qwords remain.
  BIND(L_tail);
  addptr(cnt, 4);
  jccb(Assembler::lessEqual, L_end);
  if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
    fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
  } else {
    // Scalar fallback: one qword store per iteration.
    decrement(cnt);

    BIND(L_sloop);
    movq(Address(base, 0), xtmp);
    addptr(base, 8);
    decrement(cnt);
    jccb(Assembler::greaterEqual, L_sloop);
  }
  BIND(L_end);
}
5918
// Clearing constant sized memory using YMM/ZMM registers.
//   base - start address of the area to clear
//   cnt  - number of qwords to clear, known at code generation time
//          (split below into whole 64-byte chunks plus a 0..7 qword tail)
//   rtmp - scratch register (loop index and mask constants)
//   xtmp - vector register that is zeroed and used as the store source
//   mask - opmask register for the masked tail stores
void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
  assert(UseAVX > 2 && VM_Version::supports_avx512vl(), "");
  bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);

  // Number of whole 64-byte chunks, and the remaining qwords (0..7).
  int vector64_count = (cnt & (~0x7)) >> 3;
  cnt = cnt & 0x7;
  const int fill64_per_loop = 4;
  const int max_unrolled_fill64 = 8;

  // 64 byte initialization loop.
  vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
  int start64 = 0;
  if (vector64_count > max_unrolled_fill64) {
    // Too many chunks to unroll completely: emit a loop that clears
    // fill64_per_loop chunks per iteration and covers the largest multiple
    // of fill64_per_loop chunks.
    Label LOOP;
    Register index = rtmp;

    start64 = vector64_count - (vector64_count % fill64_per_loop);

    movl(index, 0);
    BIND(LOOP);
    for (int i = 0; i < fill64_per_loop; i++) {
      fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
    }
    addl(index, fill64_per_loop * 64);
    cmpl(index, start64 * 64);
    jccb(Assembler::less, LOOP);
  }
  // Remaining whole chunks (all of them if no loop was emitted) are unrolled.
  for (int i = start64; i < vector64_count; i++) {
    fill64(base, i * 64, xtmp, use64byteVector);
  }

  // Clear remaining 64 byte tail.
  int disp = vector64_count * 64;
  if (cnt) {
    switch (cnt) {
      case 1:
        movq(Address(base, disp), xtmp);
        break;
      case 2:
        evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit);
        break;
      case 3:
        // Low three mask bits select three qwords of a 256-bit store.
        movl(rtmp, 0x7);
        kmovwl(mask, rtmp);
        evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit);
        break;
      case 4:
        evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
        break;
      case 5:
        // 5..7 qwords: one masked 512-bit store, or a 256-bit store plus a
        // store for the rest when 64-byte vectors are not used.
        if (use64byteVector) {
          movl(rtmp, 0x1F);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
        } else {
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
          movq(Address(base, disp + 32), xtmp);
        }
        break;
      case 6:
        if (use64byteVector) {
          movl(rtmp, 0x3F);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
        } else {
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
          evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit);
        }
        break;
      case 7:
        if (use64byteVector) {
          movl(rtmp, 0x7F);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
        } else {
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
          movl(rtmp, 0x7);
          kmovwl(mask, rtmp);
          evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit);
        }
        break;
      default:
        // cnt was masked to 0..7 above, so this is not expected to be hit.
        fatal("Unexpected length : %d\n",cnt);
        break;
    }
  }
}
6007
// Clears 'cnt' qwords starting at 'base' where the count is only known at
// run time. The registers are fixed (base = rdi, tmp = rax, cnt = rcx) as
// required by the rep-prefixed store instructions.
//   xtmp / mask - vector and opmask registers for the XMM-based path
//   is_large    - see below
void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
                               bool is_large, KRegister mask) {
  // cnt - number of qwords (8-byte words).
  // base - start address, qword aligned.
  // is_large - if optimizers know cnt is larger than InitArrayShortSize
  assert(base==rdi, "base register must be edi for rep stos");
  assert(tmp==rax, "tmp register must be eax for rep stos");
  assert(cnt==rcx, "cnt register must be ecx for rep stos");
  assert(InitArrayShortSize % BytesPerLong == 0,
         "InitArrayShortSize should be the multiple of BytesPerLong");

  Label DONE;
  // tmp supplies the zero for the short loop and for rep stos; it is only
  // left unzeroed when the large XMM path is the sole consumer.
  if (!is_large || !UseXMMForObjInit) {
    xorptr(tmp, tmp);
  }

  if (!is_large) {
    Label LOOP, LONG;
    // Counts above the short-size threshold go to the bulk path below.
    cmpptr(cnt, InitArrayShortSize/BytesPerLong);
    jccb(Assembler::greater, LONG);

    decrement(cnt);
    jccb(Assembler::negative, DONE); // Zero length

    // Use individual pointer-sized stores for small counts:
    // (stores run from the highest index down to index 0)
    BIND(LOOP);
    movptr(Address(base, cnt, Address::times_ptr), tmp);
    decrement(cnt);
    jccb(Assembler::greaterEqual, LOOP);
    jmpb(DONE);

    BIND(LONG);
  }

  // Use longer rep-prefixed ops for non-small counts:
  if (UseFastStosb) {
    shlptr(cnt, 3); // convert to number of bytes
    rep_stosb();
  } else if (UseXMMForObjInit) {
    xmm_clear_mem(base, cnt, tmp, xtmp, mask);
  } else {
    rep_stos();
  }

  BIND(DONE);
}
6054
6055 #endif //COMPILER2_OR_JVMCI
6056
6057
// Emits code that fills 'count' elements of type 't' (T_BYTE, T_SHORT or
// T_INT) starting at address 'to' with 'value'.
//   aligned - when false (and unaligned stores are not used) the generated
//             code first aligns 'to' for byte/short arrays
//   rtmp    - scratch general-purpose register
//   xtmp    - scratch vector register holding the replicated fill pattern
// 'to', 'value' and 'count' are all modified by the generated code.
void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                   Register to, Register value, Register count,
                                   Register rtmp, XMMRegister xtmp) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(to, value, count, rtmp);
  Label L_exit;
  Label L_fill_2_bytes, L_fill_4_bytes;

#if defined(COMPILER2)
  // Prefer the dedicated AVX-512 implementation when the CPU supports it.
  if(MaxVectorSize >=32 &&
     VM_Version::supports_avx512vlbw() &&
     VM_Version::supports_bmi2()) {
    generate_fill_avx3(t, to, value, count, rtmp, xtmp);
    return;
  }
#endif

  // shift = log2(number of elements in 4 bytes), so that
  // (N << shift) is the element count corresponding to 4 * N bytes.
  int shift = -1;
  switch (t) {
    case T_BYTE:
      shift = 2;
      break;
    case T_SHORT:
      shift = 1;
      break;
    case T_INT:
      shift = 0;
      break;
    default: ShouldNotReachHere();
  }

  // Replicate the element value across all 32 bits of 'value'.
  if (t == T_BYTE) {
    andl(value, 0xff);
    movl(rtmp, value);
    shll(rtmp, 8);
    orl(value, rtmp);
  }
  if (t == T_SHORT) {
    andl(value, 0xffff);
  }
  if (t == T_BYTE || t == T_SHORT) {
    movl(rtmp, value);
    shll(rtmp, 16);
    orl(value, rtmp);
  }

  cmpptr(count, 8 << shift); // Short arrays (< 32 bytes) fill by element
  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
    Label L_skip_align2;
    // align destination address at 4 bytes address boundary
    if (t == T_BYTE) {
      Label L_skip_align1;
      // One byte misalignment happens only for byte arrays
      testptr(to, 1);
      jccb(Assembler::zero, L_skip_align1);
      movb(Address(to, 0), value);
      increment(to);
      decrement(count);
      BIND(L_skip_align1);
    }
    // Two bytes misalignment happens only for byte and short (char) arrays
    testptr(to, 2);
    jccb(Assembler::zero, L_skip_align2);
    movw(Address(to, 0), value);
    addptr(to, 2);
    subptr(count, 1<<(shift-1));
    BIND(L_skip_align2);
  }
  {
    Label L_fill_32_bytes;
    if (!UseUnalignedLoadStores) {
      // align to 8 bytes, we know we are 4 byte aligned to start
      testptr(to, 4);
      jccb(Assembler::zero, L_fill_32_bytes);
      movl(Address(to, 0), value);
      addptr(to, 4);
      subptr(count, 1<<shift);
    }
    BIND(L_fill_32_bytes);
    {
      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
      // Move the 32-bit pattern into the vector register; it is broadcast
      // to the full vector width below.
      movdl(xtmp, value);
      if (UseAVX >= 2 && UseUnalignedLoadStores) {
        Label L_check_fill_32_bytes;
        if (UseAVX > 2) {
          // Fill 64-byte chunks
          Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;

          // If number of bytes to fill < CopyAVX3Threshold, perform fill using AVX2
          cmpptr(count, CopyAVX3Threshold);
          jccb(Assembler::below, L_check_fill_64_bytes_avx2);

          vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);

          subptr(count, 16 << shift);
          jcc(Assembler::less, L_check_fill_32_bytes);
          align(16);

          // One 512-bit store per 64-byte chunk.
          BIND(L_fill_64_bytes_loop_avx3);
          evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
          addptr(to, 64);
          subptr(count, 16 << shift);
          jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
          jmpb(L_check_fill_32_bytes);

          BIND(L_check_fill_64_bytes_avx2);
        }
        // Fill 64-byte chunks
        vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);

        subptr(count, 16 << shift);
        jcc(Assembler::less, L_check_fill_32_bytes);

        // align data for 64-byte chunks
        Label L_fill_64_bytes_loop, L_align_64_bytes_loop;
        if (EnableX86ECoreOpts) {
          // align 'big' arrays to cache lines to minimize split_stores
          cmpptr(count, 96 << shift);
          jcc(Assembler::below, L_fill_64_bytes_loop);

          // Find the bytes needed for alignment
          movptr(rtmp, to);
          andptr(rtmp, 0x1c);
          jcc(Assembler::zero, L_fill_64_bytes_loop);
          negptr(rtmp); // number of bytes to fill 32-rtmp. it filled by 2 mov by 32
          addptr(rtmp, 32);
          shrptr(rtmp, 2 - shift);// get number of elements from bytes
          subptr(count, rtmp); // adjust count by number of elements

          align(16);
          // Fill 4 bytes at a time until the alignment elements are consumed.
          BIND(L_align_64_bytes_loop);
          movdl(Address(to, 0), xtmp);
          addptr(to, 4);
          subptr(rtmp, 1 << shift);
          jcc(Assembler::greater, L_align_64_bytes_loop);
        }

        align(16);
        // Two 256-bit stores per 64-byte chunk.
        BIND(L_fill_64_bytes_loop);
        vmovdqu(Address(to, 0), xtmp);
        vmovdqu(Address(to, 32), xtmp);
        addptr(to, 64);
        subptr(count, 16 << shift);
        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);

        align(16);
        // Re-bias count by 32 bytes' worth of elements; if the result is
        // non-negative, fill one more 32-byte chunk.
        BIND(L_check_fill_32_bytes);
        addptr(count, 8 << shift);
        jccb(Assembler::less, L_check_fill_8_bytes);
        vmovdqu(Address(to, 0), xtmp);
        addptr(to, 32);
        subptr(count, 8 << shift);

        BIND(L_check_fill_8_bytes);
        // clean upper bits of YMM registers
        movdl(xtmp, value);
        pshufd(xtmp, xtmp, 0);
      } else {
        // Fill 32-byte chunks
        pshufd(xtmp, xtmp, 0);

        subptr(count, 8 << shift);
        jcc(Assembler::less, L_check_fill_8_bytes);
        align(16);

        BIND(L_fill_32_bytes_loop);

        if (UseUnalignedLoadStores) {
          movdqu(Address(to, 0), xtmp);
          movdqu(Address(to, 16), xtmp);
        } else {
          // Without unaligned stores, use four 8-byte stores instead.
          movq(Address(to, 0), xtmp);
          movq(Address(to, 8), xtmp);
          movq(Address(to, 16), xtmp);
          movq(Address(to, 24), xtmp);
        }

        addptr(to, 32);
        subptr(count, 8 << shift);
        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);

        BIND(L_check_fill_8_bytes);
      }
      // Undo the 32-byte bias; count is now the number of elements left.
      addptr(count, 8 << shift);
      jccb(Assembler::zero, L_exit);
      jmpb(L_fill_8_bytes);

      //
      // length is too short, just fill qwords
      //
      align(16);
      BIND(L_fill_8_bytes_loop);
      movq(Address(to, 0), xtmp);
      addptr(to, 8);
      BIND(L_fill_8_bytes);
      subptr(count, 1 << (shift + 1));
      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    }
  }

  // Tail handling: a remaining 4-byte chunk, then 2 bytes, then 1 byte.
  // Short arrays enter directly at L_fill_4_bytes and loop over 4-byte
  // stores.
  Label L_fill_4_bytes_loop;
  testl(count, 1 << shift);
  jccb(Assembler::zero, L_fill_2_bytes);

  align(16);
  BIND(L_fill_4_bytes_loop);
  movl(Address(to, 0), value);
  addptr(to, 4);

  BIND(L_fill_4_bytes);
  subptr(count, 1 << shift);
  jccb(Assembler::greaterEqual, L_fill_4_bytes_loop);

  if (t == T_BYTE || t == T_SHORT) {
    Label L_fill_byte;
    BIND(L_fill_2_bytes);
    // fill trailing 2 bytes
    testl(count, 1<<(shift-1));
    jccb(Assembler::zero, L_fill_byte);
    movw(Address(to, 0), value);
    if (t == T_BYTE) {
      addptr(to, 2);
      BIND(L_fill_byte);
      // fill trailing byte
      testl(count, 1);
      jccb(Assembler::zero, L_exit);
      movb(Address(to, 0), value);
    } else {
      BIND(L_fill_byte);
    }
  } else {
    // T_INT has no sub-4-byte tail; the label is bound only so that the
    // branch above has a target.
    BIND(L_fill_2_bytes);
  }
  BIND(L_exit);
}
6294
6295 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
6296 switch(type) {
6297 case T_BYTE:
6298 case T_BOOLEAN:
6299 evpbroadcastb(dst, src, vector_len);
6300 break;
6301 case T_SHORT:
6302 case T_CHAR:
6303 evpbroadcastw(dst, src, vector_len);
6304 break;
6305 case T_INT:
6306 case T_FLOAT:
6307 evpbroadcastd(dst, src, vector_len);
6308 break;
6309 case T_LONG:
6310 case T_DOUBLE:
6311 evpbroadcastq(dst, src, vector_len);
6312 break;
6313 default:
6314 fatal("Unhandled type : %s", type2name(type));
6315 break;
6316 }
6317 }
6318
6319 // Encode given char[]/byte[] to byte[] in ISO_8859_1 or ASCII
6320 //
6321 // @IntrinsicCandidate
6322 // int sun.nio.cs.ISO_8859_1.Encoder#encodeISOArray0(
6323 // char[] sa, int sp, byte[] da, int dp, int len) {
6324 // int i = 0;
6325 // for (; i < len; i++) {
6326 // char c = sa[sp++];
6327 // if (c > '\u00FF')
6328 // break;
6329 // da[dp++] = (byte) c;
6330 // }
6331 // return i;
6332 // }
6333 //
6334 // @IntrinsicCandidate
6335 // int java.lang.StringCoding.encodeISOArray0(
6336 // byte[] sa, int sp, byte[] da, int dp, int len) {
6337 // int i = 0;
6338 // for (; i < len; i++) {
6339 // char c = StringUTF16.getChar(sa, sp++);
6340 // if (c > '\u00FF')
6341 // break;
6342 // da[dp++] = (byte) c;
6343 // }
6344 // return i;
6345 // }
6346 //
6347 // @IntrinsicCandidate
6348 // int java.lang.StringCoding.encodeAsciiArray0(
6349 // char[] sa, int sp, byte[] da, int dp, int len) {
6350 // int i = 0;
6351 // for (; i < len; i++) {
6352 // char c = sa[sp++];
6353 // if (c >= '\u0080')
6354 // break;
6355 // da[dp++] = (byte) c;
6356 // }
6357 // return i;
6358 // }
6359 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
6360 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
6361 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
6362 Register tmp5, Register result, bool ascii) {
6363
6364 // rsi: src
6365 // rdi: dst
6366 // rdx: len
6367 // rcx: tmp5
6368 // rax: result
6369 ShortBranchVerifier sbv(this);
6370 assert_different_registers(src, dst, len, tmp5, result);
6371 Label L_done, L_copy_1_char, L_copy_1_char_exit;
6372
6373 int mask = ascii ? 0xff80ff80 : 0xff00ff00;
6374 int short_mask = ascii ? 0xff80 : 0xff00;
6375
6376 // set result
6377 xorl(result, result);
6378 // check for zero length
6379 testl(len, len);
6380 jcc(Assembler::zero, L_done);
6381
6382 movl(result, len);
6383
6384 // Setup pointers
6385 lea(src, Address(src, len, Address::times_2)); // char[]
6386 lea(dst, Address(dst, len, Address::times_1)); // byte[]
6387 negptr(len);
6388
6389 if (UseSSE42Intrinsics || UseAVX >= 2) {
6390 Label L_copy_8_chars, L_copy_8_chars_exit;
6391 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
6392
6393 if (UseAVX >= 2) {
6394 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
6395 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
6396 movdl(tmp1Reg, tmp5);
6397 vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
6398 jmp(L_chars_32_check);
6399
6400 bind(L_copy_32_chars);
6401 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
6402 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
6403 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
6404 vptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector
6405 jccb(Assembler::notZero, L_copy_32_chars_exit);
6406 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
6407 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
6408 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
6409
6410 bind(L_chars_32_check);
6411 addptr(len, 32);
6412 jcc(Assembler::lessEqual, L_copy_32_chars);
6413
6414 bind(L_copy_32_chars_exit);
6415 subptr(len, 16);
6416 jccb(Assembler::greater, L_copy_16_chars_exit);
6417
6418 } else if (UseSSE42Intrinsics) {
6419 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
6420 movdl(tmp1Reg, tmp5);
6421 pshufd(tmp1Reg, tmp1Reg, 0);
6422 jmpb(L_chars_16_check);
6423 }
6424
6425 bind(L_copy_16_chars);
6426 if (UseAVX >= 2) {
6427 vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
6428 vptest(tmp2Reg, tmp1Reg);
6429 jcc(Assembler::notZero, L_copy_16_chars_exit);
6430 vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
6431 vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
6432 } else {
6433 if (UseAVX > 0) {
6434 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
6435 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
6436 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
6437 } else {
6438 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
6439 por(tmp2Reg, tmp3Reg);
6440 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
6441 por(tmp2Reg, tmp4Reg);
6442 }
6443 ptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector
6444 jccb(Assembler::notZero, L_copy_16_chars_exit);
6445 packuswb(tmp3Reg, tmp4Reg);
6446 }
6447 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
6448
6449 bind(L_chars_16_check);
6450 addptr(len, 16);
6451 jcc(Assembler::lessEqual, L_copy_16_chars);
6452
6453 bind(L_copy_16_chars_exit);
6454 if (UseAVX >= 2) {
6455 // clean upper bits of YMM registers
6456 vpxor(tmp2Reg, tmp2Reg);
6457 vpxor(tmp3Reg, tmp3Reg);
6458 vpxor(tmp4Reg, tmp4Reg);
6459 movdl(tmp1Reg, tmp5);
6460 pshufd(tmp1Reg, tmp1Reg, 0);
6461 }
6462 subptr(len, 8);
6463 jccb(Assembler::greater, L_copy_8_chars_exit);
6464
6465 bind(L_copy_8_chars);
6466 movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
6467 ptest(tmp3Reg, tmp1Reg);
6468 jccb(Assembler::notZero, L_copy_8_chars_exit);
6469 packuswb(tmp3Reg, tmp1Reg);
6470 movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
6471 addptr(len, 8);
6472 jccb(Assembler::lessEqual, L_copy_8_chars);
6473
6474 bind(L_copy_8_chars_exit);
6475 subptr(len, 8);
6476 jccb(Assembler::zero, L_done);
6477 }
6478
6479 bind(L_copy_1_char);
6480 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
6481 testl(tmp5, short_mask); // check if Unicode or non-ASCII char
6482 jccb(Assembler::notZero, L_copy_1_char_exit);
6483 movb(Address(dst, len, Address::times_1, 0), tmp5);
6484 addptr(len, 1);
6485 jccb(Assembler::less, L_copy_1_char);
6486
6487 bind(L_copy_1_char_exit);
6488 addptr(result, len); // len is negative count of not processed elements
6489
6490 bind(L_done);
6491 }
6492
6493 /**
6494 * Helper for multiply_to_len().
6495 */
6496 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
6497 addq(dest_lo, src1);
6498 adcq(dest_hi, 0);
6499 addq(dest_lo, src2);
6500 adcq(dest_hi, 0);
6501 }
6502
/**
 * Multiply 64 bit by 64 bit first loop.
 *
 * Emits the first pass of the multiplication sketched below: the last
 * (one or two int) chunk of x is multiplied by y, two ints at a time, and
 * the 64-bit partial results are written to z while 'carry' is threaded
 * through. xstart, idx and kdx are decremented in place; rdx is clobbered
 * by mulq. As the inline comment on mulq notes, 'product' is expected to
 * be rax.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // Step back one more int so that x[xstart], x[xstart+1] form a 64-bit pair;
  // if that goes negative only x[0] is left (handled at L_one_x).
  decrementl(xstart);
  jcc(Assembler::negative, L_one_x);

  movq(x_xstart, Address(x, xstart, Address::times_4,  0));
  rorq(x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  // Consume two ints of y per iteration: the first decrement detects "no ints
  // left", the second detects "exactly one int left" (handled at L_one_y).
  decrementl(idx);
  jcc(Assembler::negative, L_first_loop_exit);
  decrementl(idx);
  jcc(Assembler::negative, L_one_y);
  movq(y_idx, Address(y, idx, Address::times_4,  0));
  rorq(y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);
  movq(product, x_xstart);
  mulq(y_idx); // product(rax) * y_idx -> rdx:rax
  addq(product, carry);
  adcq(rdx, 0);
  subl(kdx, 2);
  // Store the low 64 bits as two ints: low half at z[kdx+1], high half at z[kdx].
  movl(Address(z, kdx, Address::times_4,  4), product);
  shrq(product, 32);
  movl(Address(z, kdx, Address::times_4,  0), product);
  movq(carry, rdx);
  jmp(L_first_loop);

  // Only y[0] remains: load it as a single 32-bit value.
  bind(L_one_y);
  movl(y_idx, Address(y,  0));
  jmp(L_multiply);

  // Only x[0] remains: load it as a single 32-bit value.
  bind(L_one_x);
  movl(x_xstart, Address(x,  0));
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}
6558
6559 /**
6560 * Multiply 64 bit by 64 bit and add 128 bit.
6561 */
6562 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
6563 Register yz_idx, Register idx,
6564 Register carry, Register product, int offset) {
6565 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
6566 // z[kdx] = (jlong)product;
6567
6568 movq(yz_idx, Address(y, idx, Address::times_4, offset));
6569 rorq(yz_idx, 32); // convert big-endian to little-endian
6570 movq(product, x_xstart);
6571 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
6572 movq(yz_idx, Address(z, idx, Address::times_4, offset));
6573 rorq(yz_idx, 32); // convert big-endian to little-endian
6574
6575 add2_with_carry(rdx, product, carry, yz_idx);
6576
6577 movl(Address(z, idx, Address::times_4, offset+4), product);
6578 shrq(product, 32);
6579 movl(Address(z, idx, Address::times_4, offset), product);
6580
6581 }
6582
/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
 *
 * Emits the inner ("third") loop of the multiply for the non-BMI2 path.
 * Each iteration handles four ints of y/z via two calls to
 * multiply_add_128_x_128(); the remaining 0..3 ints are handled after the
 * loop by one optional 64-bit step and one optional 32-bit step.
 * idx, jdx, yz_idx, product, carry, carry2 and rdx are modified.
 */
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
                                             Register yz_idx, Register idx, Register jdx,
                                             Register carry, Register product,
                                             Register carry2) {
  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //     z[kdx+idx+1] = (jlong)product;
  //     jlong carry2  = (jlong)(product >>> 64);
  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //     z[kdx+idx] = (jlong)product;
  //     carry  = (jlong)(product >>> 64);
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)product;
  //     carry  = (jlong)(product >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = idx / 4: number of full four-int iterations.
  movl(jdx, idx);
  andl(jdx, 0xFFFFFFFC);
  shrl(jdx, 2);

  bind(L_third_loop);
  subl(jdx, 1);
  jcc(Assembler::negative, L_third_loop_exit);
  subl(idx, 4);

  // Second pair of the group (byte offset 8) first, its high half becomes carry2 ...
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
  movq(carry2, rdx);

  // ... then the first pair (byte offset 0), whose high half becomes the next carry.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
  movq(carry, rdx);
  jmp(L_third_loop);

  bind (L_third_loop_exit);

  // Remainder: idx & 3 ints are still to be processed.
  andl (idx, 0x3);
  jcc(Assembler::zero, L_post_third_loop_done);

  Label L_check_1;
  // At least two ints left: one more 64-bit multiply-add.
  subl(idx, 2);
  jcc(Assembler::negative, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
  movq(carry, rdx);

  bind (L_check_1);
  // Undo the subtraction above and test the low bit: is a single int left?
  addl (idx, 0x2);
  andl (idx, 0x1);
  subl(idx, 1);
  jcc(Assembler::negative, L_post_third_loop_done);

  // Single remaining int: 32-bit loads of y[idx] and z[idx].
  movl(yz_idx, Address(y, idx, Address::times_4,  0));
  movq(product, x_xstart);
  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
  movl(yz_idx, Address(z, idx, Address::times_4,  0));

  add2_with_carry(rdx, product, yz_idx, carry);

  // Only the low 32 bits are stored; the carry is the sum shifted right by 32,
  // i.e. (rdx << 32) | (product >> 32).
  movl(Address(z, idx, Address::times_4,  0), product);
  shrq(product, 32);

  shlq(rdx, 32);
  orq(product, rdx);
  movq(carry, product);

  bind(L_post_third_loop_done);
}
6660
/**
 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
 *
 * BMI2 counterpart of the inner ("third") loop: the multiplier is taken
 * from rdx (see the mulxq comments below) and each iteration handles four
 * ints of y/z. The remaining 0..3 ints are handled after the loop by one
 * optional 64-bit step and one optional 32-bit step. When ADX is
 * supported the in-loop additions use adcx/adox, otherwise
 * add2_with_carry(). All register arguments except y and z are written.
 */
void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
                                                  Register carry, Register carry2,
                                                  Register idx, Register jdx,
                                                  Register yz_idx1, Register yz_idx2,
                                                  Register tmp, Register tmp3, Register tmp4) {
  assert(UseBMI2Instructions, "should be used only when BMI2 is available");

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = idx / 4: number of full four-int iterations.
  movl(jdx, idx);
  andl(jdx, 0xFFFFFFFC);
  shrl(jdx, 2);

  bind(L_third_loop);
  subl(jdx, 1);
  jcc(Assembler::negative, L_third_loop_exit);
  subl(idx, 4);

  // Load both 64-bit pairs of y for this group.
  movq(yz_idx1, Address(y, idx, Address::times_4,  8));
  rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  movq(yz_idx2, Address(y, idx, Address::times_4,  0));
  rorxq(yz_idx2, yz_idx2, 32);

  mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
  mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp

  // Load the matching pairs of z (the y values are no longer needed).
  movq(yz_idx1, Address(z, idx, Address::times_4,  8));
  rorxq(yz_idx1, yz_idx1, 32);
  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
  rorxq(yz_idx2, yz_idx2, 32);

  // Accumulate: tmp3 += carry + z pair at offset 8; tmp4 += tmp + z pair at
  // offset 0 (plus carries out of tmp3); carry2 collects the carries out of tmp4.
  if (VM_Version::supports_adx()) {
    adcxq(tmp3, carry);
    adoxq(tmp3, yz_idx1);

    adcxq(tmp4, tmp);
    adoxq(tmp4, yz_idx2);

    movl(carry, 0); // does not affect flags
    adcxq(carry2, carry);
    adoxq(carry2, carry);
  } else {
    add2_with_carry(tmp4, tmp3, carry, yz_idx1);
    add2_with_carry(carry2, tmp4, tmp, yz_idx2);
  }
  movq(carry, carry2);

  // Store tmp3 and tmp4 back to z, each as two ints.
  movl(Address(z, idx, Address::times_4, 12), tmp3);
  shrq(tmp3, 32);
  movl(Address(z, idx, Address::times_4,  8), tmp3);

  movl(Address(z, idx, Address::times_4,  4), tmp4);
  shrq(tmp4, 32);
  movl(Address(z, idx, Address::times_4,  0), tmp4);

  jmp(L_third_loop);

  bind (L_third_loop_exit);

  // Remainder: idx & 3 ints are still to be processed.
  andl (idx, 0x3);
  jcc(Assembler::zero, L_post_third_loop_done);

  Label L_check_1;
  // At least two ints left: one more 64-bit multiply-add.
  subl(idx, 2);
  jcc(Assembler::negative, L_check_1);

  movq(yz_idx1, Address(y, idx, Address::times_4,  0));
  rorxq(yz_idx1, yz_idx1, 32);
  mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
  rorxq(yz_idx2, yz_idx2, 32);

  add2_with_carry(tmp4, tmp3, carry, yz_idx2);

  movl(Address(z, idx, Address::times_4,  4), tmp3);
  shrq(tmp3, 32);
  movl(Address(z, idx, Address::times_4,  0), tmp3);
  movq(carry, tmp4);

  bind (L_check_1);
  // Undo the subtraction above and test the low bit: is a single int left?
  addl (idx, 0x2);
  andl (idx, 0x1);
  subl(idx, 1);
  jcc(Assembler::negative, L_post_third_loop_done);
  // Single remaining int: 32-bit loads of y[idx] and z[idx].
  movl(tmp4, Address(y, idx, Address::times_4,  0));
  mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
  movl(tmp4, Address(z, idx, Address::times_4,  0));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  // Only the low 32 bits are stored; the carry is the sum shifted right by 32,
  // i.e. (carry2 << 32) | (tmp3 >> 32).
  movl(Address(z, idx, Address::times_4,  0), tmp3);
  shrq(tmp3, 32);

  shlq(carry2, 32);
  orq(tmp3, carry2);
  movq(carry, tmp3);

  bind(L_post_third_loop_done);
}
6782
/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * rdi: x
 * rax: xlen
 * rsi: y
 * rcx: ylen
 * r8:  z
 * r11: tmp0
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 * tmp0..tmp5 and xlen are pushed on entry and popped on exit, so their
 * values are restored. rdx is used implicitly by the multiply helpers and
 * must differ from every argument (asserted below).
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register tmp0,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);

  push(tmp0);
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  push(xlen);

  // Register roles for the first loop. Note that 'product' aliases xlen,
  // which is why xlen was saved above.
  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = tmp0;

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movl(idx, ylen);      // idx = ylen;
  lea(kdx, Address(xlen, ylen)); // kdx = xlen+ylen;
  xorq(carry, carry);   // carry = 0;

  Label L_done;

  movl(xstart, xlen);
  decrementl(xstart);
  jcc(Assembler::negative, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  // Store the carry left over from the first loop: nothing if kdx is 0,
  // one int if kdx is 1, otherwise two ints.
  Label L_second_loop;
  testl(kdx, kdx);
  jcc(Assembler::zero, L_second_loop);

  Label L_carry;
  subl(kdx, 1);
  jcc(Assembler::zero, L_carry);

  movl(Address(z, kdx, Address::times_4,  0), carry);
  shrq(carry, 32);
  subl(kdx, 1);

  bind(L_carry);
  movl(Address(z, kdx, Address::times_4,  0), carry);

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx

  const Register jdx = tmp1;

  bind(L_second_loop);
  xorl(carry, carry);    // carry = 0;
  movl(jdx, ylen);       // j = ystart+1

  subl(xstart, 1);       // i = xstart-1;
  jcc(Assembler::negative, L_done);

  // z is temporarily re-based for the inner loop and restored by pop(z) below.
  push (z);

  Label L_last_x;
  lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
  subl(xstart, 1);       // i = xstart-1;
  jcc(Assembler::negative, L_last_x);

  // Load the next 64-bit pair of x; the BMI2 inner loop takes it in rdx.
  if (UseBMI2Instructions) {
    movq(rdx,  Address(x, xstart, Address::times_4,  0));
    rorxq(rdx, rdx, 32); // convert big-endian to little-endian
  } else {
    movq(x_xstart, Address(x, xstart, Address::times_4,  0));
    rorq(x_xstart, 32);  // convert big-endian to little-endian
  }

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  // The inner loops below receive x and ylen as scratch registers (carry2 and
  // jdx respectively) and the BMI2 variant also receives tmp3 (= xstart), so
  // all three are saved here.
  push (x);
  push (xstart);
  push (ylen);


  if (UseBMI2Instructions) {
    multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
  } else { // !UseBMI2Instructions
    multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
  }

  // Pops mirror the pushes above; the slot pushed from xstart is popped into
  // xlen and then copied back into tmp3 (= xstart) just below.
  pop(ylen);
  pop(xlen);
  pop(x);
  pop(z);

  // Store the carry of this pass into z: low int at index xstart+1, high int
  // at index xstart when xstart is not negative.
  movl(tmp3, xlen);
  addl(tmp3, 1);
  movl(Address(z, tmp3, Address::times_4,  0), carry);
  subl(tmp3, 1);
  jccb(Assembler::negative, L_done);

  shrq(carry, 32);
  movl(Address(z, tmp3, Address::times_4,  0), carry);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  // Only x[0] is left: load it as a single 32-bit value.
  bind(L_last_x);
  if (UseBMI2Instructions) {
    movl(rdx, Address(x, 0));
  } else {
    movl(x_xstart, Address(x, 0));
  }
  jmp(L_third_loop_prologue);

  bind(L_done);

  pop(xlen);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
  pop(tmp0);
}
6951
// Emits code that compares the memory at obja and objb and leaves in 'result'
// either the index of the first difference or -1 when no difference is found.
//
// 'length' is scaled on entry (shlq) and 'result' is scaled back (shrq) on the
// mismatch exits; internally 'result' is used as the running byte offset into
// both inputs. Both shifts take no explicit count, and per the note below the
// scale is expected in rcx (presumably a shift by CL - confirm against the
// one-argument Assembler::shlq/shrq).
//
// length, tmp1, tmp2, rymm0..rymm2 are clobbered; the AVX-512 path also
// writes k3 and k7.
// NOTE(review): a length of 0 appears to fall through to the byte loop and
// compare one byte; presumably callers guarantee length > 0 - confirm.
void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
  Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
  Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
  Label VECTOR8_TAIL, VECTOR4_TAIL;
  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
  Label SAME_TILL_END, DONE;
  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;

  //scale is in rcx in both Win64 and Unix
  ShortBranchVerifier sbv(this);

  shlq(length);
  xorq(result, result);

  if ((AVX3Threshold == 0) && (UseAVX > 2) &&
      VM_Version::supports_avx512vlbw() && UseCountTrailingZerosInstruction) {
    Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;

    // Fewer than 64 bytes: use the narrower paths below.
    cmpq(length, 64);
    jcc(Assembler::less, VECTOR32_TAIL);

    movq(tmp1, length);
    andq(tmp1, 0x3F);      // tail count
    andq(length, ~(0x3F)); //vector count

    bind(VECTOR64_LOOP);
    // AVX512 code to compare 64 byte vectors.
    evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
    evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
    kortestql(k7, k7);
    jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
    addq(result, 64);
    subq(length, 64);
    jccb(Assembler::notZero, VECTOR64_LOOP);

    //bind(VECTOR64_TAIL);
    testq(tmp1, tmp1);
    jcc(Assembler::zero, SAME_TILL_END);

    //bind(VECTOR64_TAIL);
    // AVX512 code to compare up to 63 byte vectors.
    // Build a mask with the low tmp1 bits set: ~(all_ones << tmp1).
    mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp2, tmp2, tmp1);
    notq(tmp2);
    kmovql(k3, tmp2);

    evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
    evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);

    ktestql(k7, k3);
    jcc(Assembler::below, SAME_TILL_END);     // not mismatch

    bind(VECTOR64_NOT_EQUAL);
    // k7 has a set bit per equal byte; invert and count trailing zeros to get
    // the offset of the first differing byte.
    kmovql(tmp1, k7);
    notq(tmp1);
    tzcntq(tmp1, tmp1);
    addq(result, tmp1);
    shrq(result);
    jmp(DONE);
    bind(VECTOR32_TAIL);
  }

  // Dispatch on the byte length: exactly 8 -> one 8-byte compare, less -> 4-byte/byte tails.
  cmpq(length, 8);
  jcc(Assembler::equal, VECTOR8_LOOP);
  jcc(Assembler::less, VECTOR4_TAIL);

  if (UseAVX >= 2) {
    Label VECTOR16_TAIL, VECTOR32_LOOP;

    cmpq(length, 16);
    jcc(Assembler::equal, VECTOR16_LOOP);
    jcc(Assembler::less, VECTOR8_LOOP);

    cmpq(length, 32);
    jccb(Assembler::less, VECTOR16_TAIL);

    // length is biased by -32 while in the loop so the sign of the subtraction
    // result tells whether another full 32-byte block remains.
    subq(length, 32);
    bind(VECTOR32_LOOP);
    vmovdqu(rymm0, Address(obja, result));
    vmovdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
    vptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
    addq(result, 32);
    subq(length, 32);
    jcc(Assembler::greaterEqual, VECTOR32_LOOP);
    addq(length, 32);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 32 bytes left //close the branch here.

    bind(VECTOR16_TAIL);
    cmpq(length, 16);
    jccb(Assembler::less, VECTOR8_TAIL);
    bind(VECTOR16_LOOP);
    movdqu(rymm0, Address(obja, result));
    movdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
    ptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
    addq(result, 16);
    subq(length, 16);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 16 bytes left
  } else {//regular intrinsics

    cmpq(length, 16);
    jccb(Assembler::less, VECTOR8_TAIL);

    // Same biasing trick as the 32-byte loop above, with a 16-byte step.
    subq(length, 16);
    bind(VECTOR16_LOOP);
    movdqu(rymm0, Address(obja, result));
    movdqu(rymm1, Address(objb, result));
    pxor(rymm0, rymm1);
    ptest(rymm0, rymm0);
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
    addq(result, 16);
    subq(length, 16);
    jccb(Assembler::greaterEqual, VECTOR16_LOOP);
    addq(length, 16);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 16 bytes left
  }

  bind(VECTOR8_TAIL);
  cmpq(length, 8);
  jccb(Assembler::less, VECTOR4_TAIL);
  bind(VECTOR8_LOOP);
  movq(tmp1, Address(obja, result));
  movq(tmp2, Address(objb, result));
  xorq(tmp1, tmp2);
  testq(tmp1, tmp1);
  jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
  addq(result, 8);
  subq(length, 8);
  jcc(Assembler::equal, SAME_TILL_END);
  //falling through if less than 8 bytes left

  bind(VECTOR4_TAIL);
  cmpq(length, 4);
  jccb(Assembler::less, BYTES_TAIL);
  bind(VECTOR4_LOOP);
  movl(tmp1, Address(obja, result));
  xorl(tmp1, Address(objb, result));
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
  addq(result, 4);
  subq(length, 4);
  jcc(Assembler::equal, SAME_TILL_END);
  //falling through if less than 4 bytes left

  // Byte tail, fully unrolled: up to three single-byte compares.
  bind(BYTES_TAIL);
  bind(BYTES_LOOP);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  decq(length);
  jcc(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  decq(length);
  jcc(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  jmp(SAME_TILL_END);

  // Mismatch exits. The vector variants build a per-byte "differs" bit mask
  // (equality compare XOR-ed with all-ones) and take the lowest set bit as
  // the byte offset within the block.
  if (UseAVX >= 2) {
    bind(VECTOR32_NOT_EQUAL);
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
    vpmovmskb(tmp1, rymm0);
    bsfq(tmp1, tmp1);
    addq(result, tmp1);
    shrq(result);
    jmp(DONE);
  }

  bind(VECTOR16_NOT_EQUAL);
  if (UseAVX >= 2) {
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
    pxor(rymm0, rymm2);
  } else {
    pcmpeqb(rymm2, rymm2);
    // rymm0 holds obja ^ objb from the loop; XOR-ing with rymm1 again
    // recovers the obja bytes before the equality compare.
    pxor(rymm0, rymm1);
    pcmpeqb(rymm0, rymm1);
    pxor(rymm0, rymm2);
  }
  pmovmskb(tmp1, rymm0);
  bsfq(tmp1, tmp1);
  addq(result, tmp1);
  shrq(result);
  jmpb(DONE);

  // tmp1 holds the XOR of the two words: lowest set bit / 8 is the byte offset.
  bind(VECTOR8_NOT_EQUAL);
  bind(VECTOR4_NOT_EQUAL);
  bsfq(tmp1, tmp1);
  shrq(tmp1, 3);
  addq(result, tmp1);
  bind(BYTES_NOT_EQUAL);
  shrq(result);
  jmpb(DONE);

  bind(SAME_TILL_END);
  mov64(result, -1);

  bind(DONE);
}
7171
7172 //Helper functions for square_to_len()
7173
7174 /**
7175 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
7176 * Preserves x and z and modifies rest of the registers.
7177 */
7178 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7179 // Perform square and right shift by 1
7180 // Handle odd xlen case first, then for even xlen do the following
7181 // jlong carry = 0;
7182 // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
7183 // huge_128 product = x[j:j+1] * x[j:j+1];
7184 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
7185 // z[i+2:i+3] = (jlong)(product >>> 1);
7186 // carry = (jlong)product;
7187 // }
7188
7189 xorq(tmp5, tmp5); // carry
7190 xorq(rdxReg, rdxReg);
7191 xorl(tmp1, tmp1); // index for x
7192 xorl(tmp4, tmp4); // index for z
7193
7194 Label L_first_loop, L_first_loop_exit;
7195
7196 testl(xlen, 1);
7197 jccb(Assembler::zero, L_first_loop); //jump if xlen is even
7198
7199 // Square and right shift by 1 the odd element using 32 bit multiply
7200 movl(raxReg, Address(x, tmp1, Address::times_4, 0));
7201 imulq(raxReg, raxReg);
7202 shrq(raxReg, 1);
7203 adcq(tmp5, 0);
7204 movq(Address(z, tmp4, Address::times_4, 0), raxReg);
7205 incrementl(tmp1);
7206 addl(tmp4, 2);
7207
7208 // Square and right shift by 1 the rest using 64 bit multiply
7209 bind(L_first_loop);
7210 cmpptr(tmp1, xlen);
7211 jccb(Assembler::equal, L_first_loop_exit);
7212
7213 // Square
7214 movq(raxReg, Address(x, tmp1, Address::times_4, 0));
7215 rorq(raxReg, 32); // convert big-endian to little-endian
7216 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax
7217
7218 // Right shift by 1 and save carry
7219 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
7220 rcrq(rdxReg, 1);
7221 rcrq(raxReg, 1);
7222 adcq(tmp5, 0);
7223
7224 // Store result in z
7225 movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
7226 movq(Address(z, tmp4, Address::times_4, 8), raxReg);
7227
7228 // Update indices for x and z
7229 addl(tmp1, 2);
7230 addl(tmp4, 4);
7231 jmp(L_first_loop);
7232
7233 bind(L_first_loop_exit);
7234 }
7235
7236
7237 /**
7238 * Perform the following multiply add operation using BMI2 instructions
7239 * carry:sum = sum + op1*op2 + carry
7240 * op2 should be in rdx
7241 * op2 is preserved, all other registers are modified
7242 */
7243 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
7244 // assert op2 is rdx
7245 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1
7246 addq(sum, carry);
7247 adcq(tmp2, 0);
7248 addq(sum, op1);
7249 adcq(tmp2, 0);
7250 movq(carry, tmp2);
7251 }
7252
7253 /**
7254 * Perform the following multiply add operation:
7255 * carry:sum = sum + op1*op2 + carry
7256 * Preserves op1, op2 and modifies rest of registers
7257 */
7258 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
7259 // rdx:rax = op1 * op2
7260 movq(raxReg, op2);
7261 mulq(op1);
7262
7263 // rdx:rax = sum + carry + rdx:rax
7264 addq(sum, carry);
7265 adcq(rdxReg, 0);
7266 addq(sum, raxReg);
7267 adcq(rdxReg, 0);
7268
7269 // carry:sum = rdx:sum
7270 movq(carry, rdxReg);
7271 }
7272
7273 /**
7274 * Add 64 bit long carry into z[] with carry propagation.
7275 * Preserves z and carry register values and modifies rest of registers.
7276 *
7277 */
7278 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
7279 Label L_fourth_loop, L_fourth_loop_exit;
7280
7281 movl(tmp1, 1);
7282 subl(zlen, 2);
7283 addq(Address(z, zlen, Address::times_4, 0), carry);
7284
7285 bind(L_fourth_loop);
7286 jccb(Assembler::carryClear, L_fourth_loop_exit);
7287 subl(zlen, 2);
7288 jccb(Assembler::negative, L_fourth_loop_exit);
7289 addq(Address(z, zlen, Address::times_4, 0), tmp1);
7290 jmp(L_fourth_loop);
7291 bind(L_fourth_loop_exit);
7292 }
7293
7294 /**
7295 * Shift z[] left by 1 bit.
7296 * Preserves x, len, z and zlen registers and modifies rest of the registers.
7297 *
7298 */
7299 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
7300
7301 Label L_fifth_loop, L_fifth_loop_exit;
7302
7303 // Fifth loop
7304 // Perform primitiveLeftShift(z, zlen, 1)
7305
7306 const Register prev_carry = tmp1;
7307 const Register new_carry = tmp4;
7308 const Register value = tmp2;
7309 const Register zidx = tmp3;
7310
7311 // int zidx, carry;
7312 // long value;
7313 // carry = 0;
7314 // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
7315 // (carry:value) = (z[i] << 1) | carry ;
7316 // z[i] = value;
7317 // }
7318
7319 movl(zidx, zlen);
7320 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
7321
7322 bind(L_fifth_loop);
7323 decl(zidx); // Use decl to preserve carry flag
7324 decl(zidx);
7325 jccb(Assembler::negative, L_fifth_loop_exit);
7326
7327 if (UseBMI2Instructions) {
7328 movq(value, Address(z, zidx, Address::times_4, 0));
7329 rclq(value, 1);
7330 rorxq(value, value, 32);
7331 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
7332 }
7333 else {
7334 // clear new_carry
7335 xorl(new_carry, new_carry);
7336
7337 // Shift z[i] by 1, or in previous carry and save new carry
7338 movq(value, Address(z, zidx, Address::times_4, 0));
7339 shlq(value, 1);
7340 adcl(new_carry, 0);
7341
7342 orq(value, prev_carry);
7343 rorq(value, 0x20);
7344 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
7345
7346 // Set previous carry = new carry
7347 movl(prev_carry, new_carry);
7348 }
7349 jmp(L_fifth_loop);
7350
7351 bind(L_fifth_loop_exit);
7352 }
7353
7354
7355 /**
7356 * Code for BigInteger::squareToLen() intrinsic
7357 *
7358 * rdi: x
7359 * rsi: len
7360 * r8: z
7361 * rcx: zlen
7362 * r12: tmp1
7363 * r13: tmp2
7364 * r14: tmp3
7365 * r15: tmp4
7366 * rbx: tmp5
7367 *
7368 */
7369 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7370
7371 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
7372 push(tmp1);
7373 push(tmp2);
7374 push(tmp3);
7375 push(tmp4);
7376 push(tmp5);
7377
7378 // First loop
7379 // Store the squares, right shifted one bit (i.e., divided by 2).
7380 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
7381
7382 // Add in off-diagonal sums.
7383 //
7384 // Second, third (nested) and fourth loops.
7385 // zlen +=2;
7386 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
7387 // carry = 0;
7388 // long op2 = x[xidx:xidx+1];
7389 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
7390 // k -= 2;
7391 // long op1 = x[j:j+1];
7392 // long sum = z[k:k+1];
7393 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
7394 // z[k:k+1] = sum;
7395 // }
7396 // add_one_64(z, k, carry, tmp_regs);
7397 // }
7398
7399 const Register carry = tmp5;
7400 const Register sum = tmp3;
7401 const Register op1 = tmp4;
7402 Register op2 = tmp2;
7403
7404 push(zlen);
7405 push(len);
7406 addl(zlen,2);
7407 bind(L_second_loop);
7408 xorq(carry, carry);
7409 subl(zlen, 4);
7410 subl(len, 2);
7411 push(zlen);
7412 push(len);
7413 cmpl(len, 0);
7414 jccb(Assembler::lessEqual, L_second_loop_exit);
7415
7416 // Multiply an array by one 64 bit long.
7417 if (UseBMI2Instructions) {
7418 op2 = rdxReg;
7419 movq(op2, Address(x, len, Address::times_4, 0));
7420 rorxq(op2, op2, 32);
7421 }
7422 else {
7423 movq(op2, Address(x, len, Address::times_4, 0));
7424 rorq(op2, 32);
7425 }
7426
7427 bind(L_third_loop);
7428 decrementl(len);
7429 jccb(Assembler::negative, L_third_loop_exit);
7430 decrementl(len);
7431 jccb(Assembler::negative, L_last_x);
7432
7433 movq(op1, Address(x, len, Address::times_4, 0));
7434 rorq(op1, 32);
7435
7436 bind(L_multiply);
7437 subl(zlen, 2);
7438 movq(sum, Address(z, zlen, Address::times_4, 0));
7439
7440 // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
7441 if (UseBMI2Instructions) {
7442 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
7443 }
7444 else {
7445 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7446 }
7447
7448 movq(Address(z, zlen, Address::times_4, 0), sum);
7449
7450 jmp(L_third_loop);
7451 bind(L_third_loop_exit);
7452
7453 // Fourth loop
7454 // Add 64 bit long carry into z with carry propagation.
7455 // Uses offsetted zlen.
7456 add_one_64(z, zlen, carry, tmp1);
7457
7458 pop(len);
7459 pop(zlen);
7460 jmp(L_second_loop);
7461
7462 // Next infrequent code is moved outside loops.
7463 bind(L_last_x);
7464 movl(op1, Address(x, 0));
7465 jmp(L_multiply);
7466
7467 bind(L_second_loop_exit);
7468 pop(len);
7469 pop(zlen);
7470 pop(len);
7471 pop(zlen);
7472
7473 // Fifth loop
7474 // Shift z left 1 bit.
7475 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
7476
7477 // z[zlen-1] |= x[len-1] & 1;
7478 movl(tmp3, Address(x, len, Address::times_4, -4));
7479 andl(tmp3, 1);
7480 orl(Address(z, zlen, Address::times_4, -4), tmp3);
7481
7482 pop(tmp5);
7483 pop(tmp4);
7484 pop(tmp3);
7485 pop(tmp2);
7486 pop(tmp1);
7487 }
7488
7489 /**
7490 * Helper function for mul_add()
7491 * Multiply the in[] by int k and add to out[] starting at offset offs using
7492 * 128 bit by 32 bit multiply and return the carry in tmp5.
7493 * Only quad int aligned length of in[] is operated on in this function.
7494 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
7495 * This function preserves out, in and k registers.
7496 * len and offset point to the appropriate index in "in" & "out" correspondingly
7497 * tmp5 has the carry.
7498 * other registers are temporary and are modified.
7499 *
7500 */
7501 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
7502 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
7503 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7504
7505 Label L_first_loop, L_first_loop_exit;
7506
7507 movl(tmp1, len);
7508 shrl(tmp1, 2);
7509
7510 bind(L_first_loop);
7511 subl(tmp1, 1);
7512 jccb(Assembler::negative, L_first_loop_exit);
7513
7514 subl(len, 4);
7515 subl(offset, 4);
7516
7517 Register op2 = tmp2;
7518 const Register sum = tmp3;
7519 const Register op1 = tmp4;
7520 const Register carry = tmp5;
7521
7522 if (UseBMI2Instructions) {
7523 op2 = rdxReg;
7524 }
7525
7526 movq(op1, Address(in, len, Address::times_4, 8));
7527 rorq(op1, 32);
7528 movq(sum, Address(out, offset, Address::times_4, 8));
7529 rorq(sum, 32);
7530 if (UseBMI2Instructions) {
7531 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7532 }
7533 else {
7534 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7535 }
7536 // Store back in big endian from little endian
7537 rorq(sum, 0x20);
7538 movq(Address(out, offset, Address::times_4, 8), sum);
7539
7540 movq(op1, Address(in, len, Address::times_4, 0));
7541 rorq(op1, 32);
7542 movq(sum, Address(out, offset, Address::times_4, 0));
7543 rorq(sum, 32);
7544 if (UseBMI2Instructions) {
7545 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7546 }
7547 else {
7548 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7549 }
7550 // Store back in big endian from little endian
7551 rorq(sum, 0x20);
7552 movq(Address(out, offset, Address::times_4, 0), sum);
7553
7554 jmp(L_first_loop);
7555 bind(L_first_loop_exit);
7556 }
7557
7558 /**
7559 * Code for BigInteger::mulAdd() intrinsic
7560 *
7561 * rdi: out
7562 * rsi: in
7563 * r11: offs (out.length - offset)
7564 * rcx: len
7565 * r8: k
7566 * r12: tmp1
7567 * r13: tmp2
7568 * r14: tmp3
7569 * r15: tmp4
7570 * rbx: tmp5
7571 * Multiply the in[] by word k and add to out[], return the carry in rax
7572 */
7573 void MacroAssembler::mul_add(Register out, Register in, Register offs,
7574 Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
7575 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7576
7577 Label L_carry, L_last_in, L_done;
7578
7579 // carry = 0;
7580 // for (int j=len-1; j >= 0; j--) {
7581 // long product = (in[j] & LONG_MASK) * kLong +
7582 // (out[offs] & LONG_MASK) + carry;
7583 // out[offs--] = (int)product;
7584 // carry = product >>> 32;
7585 // }
7586 //
7587 push(tmp1);
7588 push(tmp2);
7589 push(tmp3);
7590 push(tmp4);
7591 push(tmp5);
7592
7593 Register op2 = tmp2;
7594 const Register sum = tmp3;
7595 const Register op1 = tmp4;
7596 const Register carry = tmp5;
7597
7598 if (UseBMI2Instructions) {
7599 op2 = rdxReg;
7600 movl(op2, k);
7601 }
7602 else {
7603 movl(op2, k);
7604 }
7605
7606 xorq(carry, carry);
7607
7608 //First loop
7609
7610 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
7611 //The carry is in tmp5
7612 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
7613
7614 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
7615 decrementl(len);
7616 jccb(Assembler::negative, L_carry);
7617 decrementl(len);
7618 jccb(Assembler::negative, L_last_in);
7619
7620 movq(op1, Address(in, len, Address::times_4, 0));
7621 rorq(op1, 32);
7622
7623 subl(offs, 2);
7624 movq(sum, Address(out, offs, Address::times_4, 0));
7625 rorq(sum, 32);
7626
7627 if (UseBMI2Instructions) {
7628 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7629 }
7630 else {
7631 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7632 }
7633
7634 // Store back in big endian from little endian
7635 rorq(sum, 0x20);
7636 movq(Address(out, offs, Address::times_4, 0), sum);
7637
7638 testl(len, len);
7639 jccb(Assembler::zero, L_carry);
7640
7641 //Multiply the last in[] entry, if any
7642 bind(L_last_in);
7643 movl(op1, Address(in, 0));
7644 movl(sum, Address(out, offs, Address::times_4, -4));
7645
7646 movl(raxReg, k);
7647 mull(op1); //tmp4 * eax -> edx:eax
7648 addl(sum, carry);
7649 adcl(rdxReg, 0);
7650 addl(sum, raxReg);
7651 adcl(rdxReg, 0);
7652 movl(carry, rdxReg);
7653
7654 movl(Address(out, offs, Address::times_4, -4), sum);
7655
7656 bind(L_carry);
7657 //return tmp5/carry as carry in rax
7658 movl(rax, carry);
7659
7660 bind(L_done);
7661 pop(tmp5);
7662 pop(tmp4);
7663 pop(tmp3);
7664 pop(tmp2);
7665 pop(tmp1);
7666 }
7667
/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 *                      Clobbered: on exit it holds the table index.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  xorl(val, crc);
  andl(val, 0xFF);   // only the low byte is kept, so any upper bits of val are irrelevant
  shrl(crc, 8); // unsigned shift
  xorl(crc, Address(table, val, Address::times_4, 0));   // 4-byte table entries
}
7686
7687 /**
7688 * Fold 128-bit data chunk
7689 */
7690 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7691 if (UseAVX > 0) {
7692 vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7693 vpclmulldq(xcrc, xK, xcrc); // [63:0]
7694 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7695 pxor(xcrc, xtmp);
7696 } else {
7697 movdqa(xtmp, xcrc);
7698 pclmulhdq(xtmp, xK); // [123:64]
7699 pclmulldq(xcrc, xK); // [63:0]
7700 pxor(xcrc, xtmp);
7701 movdqu(xtmp, Address(buf, offset));
7702 pxor(xcrc, xtmp);
7703 }
7704 }
7705
7706 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7707 if (UseAVX > 0) {
7708 vpclmulhdq(xtmp, xK, xcrc);
7709 vpclmulldq(xcrc, xK, xcrc);
7710 pxor(xcrc, xbuf);
7711 pxor(xcrc, xtmp);
7712 } else {
7713 movdqa(xtmp, xcrc);
7714 pclmulhdq(xtmp, xK);
7715 pclmulldq(xcrc, xK);
7716 pxor(xcrc, xbuf);
7717 pxor(xcrc, xtmp);
7718 }
7719 }
7720
/**
 * 8-bit folds to compute 32-bit CRC
 *
 * uint64_t xcrc;
 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
 *
 * The result is left in xcrc; xtmp and tmp are clobbered.
 */
void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
  movdl(tmp, xcrc);
  andl(tmp, 0xFF);                                         // table index = low byte
  movdl(xtmp, Address(table, tmp, Address::times_4, 0));   // 4-byte table entries
  psrldq(xcrc, 1); // unsigned shift one byte
  pxor(xcrc, xtmp);
}
7734
/**
 * General-purpose-register variant of the 8-bit fold.
 *
 * uint32_t crc;
 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
 *
 * The result is left in crc; tmp is clobbered.
 */
void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
  movl(tmp, crc);
  andl(tmp, 0xFF);                                       // table index = low byte
  shrl(crc, 8);
  xorl(crc, Address(table, tmp, Address::times_4, 0));   // 4-byte table entries
}
7745
/**
 * Emits a CRC-32 kernel that combines byte-wise table lookups (for the
 * unaligned head and the tail) with carry-less-multiply folding of
 * 16-byte chunks.
 *
 * @param crc   register containing existing CRC (32-bit); holds the result on exit
 * @param buf   register pointing to input byte buffer (byte*); advanced as data is consumed
 * @param len   register containing number of bytes; clobbered
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 *
 * rax, rscratch1 and xmm0-xmm5 are used as additional scratch.
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
  assert_different_registers(crc, buf, len, table, tmp, rax);

  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;

  // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
  // context for the registers used, where all instructions below are using 128-bit mode
  // On EVEX without VL and BW, these instructions will all be AVX.
  lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
  notl(crc); // ~crc
  cmpl(len, 16);
  jcc(Assembler::less, L_tail);   // fewer than 16 bytes: byte-wise only

  // Align buffer to 16 bytes
  movl(tmp, buf);
  andl(tmp, 0xF);
  jccb(Assembler::zero, L_aligned);
  subl(tmp,  16);   // tmp = -(bytes needed to reach alignment)
  addl(len, tmp);   // those bytes are consumed by the loop below

  align(4);
  BIND(L_align_loop);
  // The sign extension is harmless: update_byte_crc32 masks the value with 0xFF.
  movsbl(rax, Address(buf, 0)); // load byte with sign extension
  update_byte_crc32(crc, rax, table);
  increment(buf);
  incrementl(tmp);
  jccb(Assembler::less, L_align_loop);   // loop while tmp is still negative

  BIND(L_aligned);
  movl(tmp, len); // save
  shrl(len, 4);   // len = number of whole 16-byte chunks
  jcc(Assembler::zero, L_tail_restore);

  // Fold crc into first bytes of vector
  movdqa(xmm1, Address(buf, 0));
  movdl(rax, xmm1);
  xorl(crc, rax);
  if (VM_Version::supports_sse4_1()) {
    pinsrd(xmm1, crc, 0);
  } else {
    // Without pinsrd the 32-bit value is inserted as two 16-bit halves.
    pinsrw(xmm1, crc, 0);
    shrl(crc, 16);
    pinsrw(xmm1, crc, 1);
  }
  addptr(buf, 16);
  subl(len, 4); // len > 0
  jcc(Assembler::less, L_fold_tail);   // fewer than 4 chunks in total

  // Three more chunks, so that four streams (xmm1..xmm4) are live.
  movdqa(xmm2, Address(buf,  0));
  movdqa(xmm3, Address(buf, 16));
  movdqa(xmm4, Address(buf, 32));
  addptr(buf, 48);
  subl(len, 3);   // len is now (chunks not yet loaded) - 3
  jcc(Assembler::lessEqual, L_fold_512b);   // not enough for one full 4-chunk pass

  // Fold total 512 bits of polynomial on each iteration,
  // 128 bits per each of 4 parallel streams.
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32), rscratch1);

  align32();
  BIND(L_fold_512b_loop);
  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
  fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
  fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
  fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
  addptr(buf, 64);
  subl(len, 4);
  jcc(Assembler::greater, L_fold_512b_loop);

  // Fold 512 bits to 128 bits.
  BIND(L_fold_512b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);

  // Fold the rest of 128 bits data chunks
  BIND(L_fold_tail);
  addl(len, 3);   // undo the bias: len = chunks still to be folded one at a time
  jccb(Assembler::lessEqual, L_fold_128b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);

  BIND(L_fold_tail_loop);
  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
  addptr(buf, 16);
  decrementl(len);
  jccb(Assembler::greater, L_fold_tail_loop);

  // Fold 128 bits in xmm1 down into 32 bits in crc register.
  BIND(L_fold_128b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()), rscratch1);
  if (UseAVX > 0) {
    vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
    vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
    vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
  } else {
    // Same computation with two-operand instructions, hence the extra copies.
    movdqa(xmm2, xmm0);
    pclmulqdq(xmm2, xmm1, 0x1);
    movdqa(xmm3, xmm0);
    pand(xmm3, xmm2);
    pclmulqdq(xmm0, xmm3, 0x1);
  }
  psrldq(xmm1, 8);
  psrldq(xmm2, 4);
  pxor(xmm0, xmm1);
  pxor(xmm0, xmm2);

  // 8 8-bit folds to compute 32-bit CRC.
  // The first four are done in the xmm register, the last four in crc.
  for (int j = 0; j < 4; j++) {
    fold_8bit_crc32(xmm0, table, xmm1, rax);
  }
  movdl(crc, xmm0); // mov 32 bits to general register
  for (int j = 0; j < 4; j++) {
    fold_8bit_crc32(crc, table, rax);
  }

  BIND(L_tail_restore);
  movl(len, tmp); // restore
  BIND(L_tail);
  andl(len, 0xf);   // bytes left over after the 16-byte chunks
  jccb(Assembler::zero, L_exit);

  // Fold the rest of bytes
  align(4);
  BIND(L_tail_loop);
  movsbl(rax, Address(buf, 0)); // load byte with sign extension
  update_byte_crc32(crc, rax, table);
  increment(buf);
  decrementl(len);
  jccb(Assembler::greater, L_tail_loop);

  BIND(L_exit);
  notl(crc); // ~c
}
7888
// Helper function for AVX 512 CRC32
// Fold 512-bit data chunks
//
// Multiplies xcrc carry-lessly by the constants in xK (selectors 0x10 and
// 0x01), xors the two products together and then xors in the 64 bytes at
// buf + pos + offset. The result is left in xcrc.
// xtmp is clobbered, and so are xmm2 and xmm3, which are used as fixed
// scratch registers regardless of the arguments.
void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
                                             Register pos, int offset) {
  evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);   // next 64 bytes of input
  evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
  evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
  evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
  evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
}
7899
7900 // Helper function for AVX 512 CRC32
7901 // Compute CRC32 for < 256B buffers
7902 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
7903 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7904 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7905
7906 Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
7907 Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
7908 Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
7909
7910 // check if there is enough buffer to be able to fold 16B at a time
7911 cmpl(len, 32);
7912 jcc(Assembler::less, L_less_than_32);
7913
7914 // if there is, load the constants
7915 movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10
7916 movdl(xmm0, crc); // get the initial crc value
7917 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7918 pxor(xmm7, xmm0);
7919
7920 // update the buffer pointer
7921 addl(pos, 16);
7922 //update the counter.subtract 32 instead of 16 to save one instruction from the loop
7923 subl(len, 32);
7924 jmp(L_16B_reduction_loop);
7925
7926 bind(L_less_than_32);
7927 //mov initial crc to the return value. this is necessary for zero - length buffers.
7928 movl(rax, crc);
7929 testl(len, len);
7930 jcc(Assembler::equal, L_cleanup);
7931
7932 movdl(xmm0, crc); //get the initial crc value
7933
7934 cmpl(len, 16);
7935 jcc(Assembler::equal, L_exact_16_left);
7936 jcc(Assembler::less, L_less_than_16_left);
7937
7938 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7939 pxor(xmm7, xmm0); //xor the initial crc value
7940 addl(pos, 16);
7941 subl(len, 16);
7942 movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10
7943 jmp(L_get_last_two_xmms);
7944
7945 bind(L_less_than_16_left);
7946 //use stack space to load data less than 16 bytes, zero - out the 16B in memory first.
7947 pxor(xmm1, xmm1);
7948 movptr(tmp1, rsp);
7949 movdqu(Address(tmp1, 0 * 16), xmm1);
7950
7951 cmpl(len, 4);
7952 jcc(Assembler::less, L_only_less_than_4);
7953
7954 //backup the counter value
7955 movl(tmp2, len);
7956 cmpl(len, 8);
7957 jcc(Assembler::less, L_less_than_8_left);
7958
7959 //load 8 Bytes
7960 movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
7961 movq(Address(tmp1, 0 * 16), rax);
7962 addptr(tmp1, 8);
7963 subl(len, 8);
7964 addl(pos, 8);
7965
7966 bind(L_less_than_8_left);
7967 cmpl(len, 4);
7968 jcc(Assembler::less, L_less_than_4_left);
7969
7970 //load 4 Bytes
7971 movl(rax, Address(buf, pos, Address::times_1, 0));
7972 movl(Address(tmp1, 0 * 16), rax);
7973 addptr(tmp1, 4);
7974 subl(len, 4);
7975 addl(pos, 4);
7976
7977 bind(L_less_than_4_left);
7978 cmpl(len, 2);
7979 jcc(Assembler::less, L_less_than_2_left);
7980
7981 // load 2 Bytes
7982 movw(rax, Address(buf, pos, Address::times_1, 0));
7983 movl(Address(tmp1, 0 * 16), rax);
7984 addptr(tmp1, 2);
7985 subl(len, 2);
7986 addl(pos, 2);
7987
7988 bind(L_less_than_2_left);
7989 cmpl(len, 1);
7990 jcc(Assembler::less, L_zero_left);
7991
7992 // load 1 Byte
7993 movb(rax, Address(buf, pos, Address::times_1, 0));
7994 movb(Address(tmp1, 0 * 16), rax);
7995
7996 bind(L_zero_left);
7997 movdqu(xmm7, Address(rsp, 0));
7998 pxor(xmm7, xmm0); //xor the initial crc value
7999
8000 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
8001 movdqu(xmm0, Address(rax, tmp2));
8002 pshufb(xmm7, xmm0);
8003 jmp(L_128_done);
8004
8005 bind(L_exact_16_left);
8006 movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
8007 pxor(xmm7, xmm0); //xor the initial crc value
8008 jmp(L_128_done);
8009
8010 bind(L_only_less_than_4);
8011 cmpl(len, 3);
8012 jcc(Assembler::less, L_only_less_than_3);
8013
8014 // load 3 Bytes
8015 movb(rax, Address(buf, pos, Address::times_1, 0));
8016 movb(Address(tmp1, 0), rax);
8017
8018 movb(rax, Address(buf, pos, Address::times_1, 1));
8019 movb(Address(tmp1, 1), rax);
8020
8021 movb(rax, Address(buf, pos, Address::times_1, 2));
8022 movb(Address(tmp1, 2), rax);
8023
8024 movdqu(xmm7, Address(rsp, 0));
8025 pxor(xmm7, xmm0); //xor the initial crc value
8026
8027 pslldq(xmm7, 0x5);
8028 jmp(L_barrett);
8029 bind(L_only_less_than_3);
8030 cmpl(len, 2);
8031 jcc(Assembler::less, L_only_less_than_2);
8032
8033 // load 2 Bytes
8034 movb(rax, Address(buf, pos, Address::times_1, 0));
8035 movb(Address(tmp1, 0), rax);
8036
8037 movb(rax, Address(buf, pos, Address::times_1, 1));
8038 movb(Address(tmp1, 1), rax);
8039
8040 movdqu(xmm7, Address(rsp, 0));
8041 pxor(xmm7, xmm0); //xor the initial crc value
8042
8043 pslldq(xmm7, 0x6);
8044 jmp(L_barrett);
8045
8046 bind(L_only_less_than_2);
8047 //load 1 Byte
8048 movb(rax, Address(buf, pos, Address::times_1, 0));
8049 movb(Address(tmp1, 0), rax);
8050
8051 movdqu(xmm7, Address(rsp, 0));
8052 pxor(xmm7, xmm0); //xor the initial crc value
8053
8054 pslldq(xmm7, 0x7);
8055 }
8056
8057 /**
8058 * Compute CRC32 using AVX512 instructions
8059 * param crc register containing existing CRC (32-bit)
8060 * param buf register pointing to input byte buffer (byte*)
8061 * param len register containing number of bytes
8062 * param table address of crc or crc32c table
8063 * param tmp1 scratch register
8064 * param tmp2 scratch register
8065 * return rax result register
8066 *
8067 * This routine is identical for crc32c with the exception of the precomputed constant
8068 * table which will be passed as the table argument. The calculation steps are
8069 * the same for both variants.
8070 */
8071 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
8072 assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
8073
8074 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8075 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8076 Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
8077 Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
8078 Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
8079
8080 const Register pos = r12;
8081 push(r12);
8082 subptr(rsp, 16 * 2 + 8);
8083
8084 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
8085 // context for the registers used, where all instructions below are using 128-bit mode
8086 // On EVEX without VL and BW, these instructions will all be AVX.
8087 movl(pos, 0);
8088
8089 // check if smaller than 256B
8090 cmpl(len, 256);
8091 jcc(Assembler::less, L_less_than_256);
8092
8093 // load the initial crc value
8094 movdl(xmm10, crc);
8095
8096 // receive the initial 64B data, xor the initial crc value
8097 evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
8098 evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
8099 evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
8100 evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
8101
8102 subl(len, 256);
8103 cmpl(len, 256);
8104 jcc(Assembler::less, L_fold_128_B_loop);
8105
8106 evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
8107 evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
8108 evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
8109 subl(len, 256);
8110
8111 bind(L_fold_256_B_loop);
8112 addl(pos, 256);
8113 fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
8114 fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
8115 fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
8116 fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
8117
8118 subl(len, 256);
8119 jcc(Assembler::greaterEqual, L_fold_256_B_loop);
8120
8121 // Fold 256 into 128
8122 addl(pos, 256);
8123 evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
8124 evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
8125 vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
8126
8127 evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
8128 evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
8129 vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
8130
8131 evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
8132 evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
8133
8134 addl(len, 128);
8135 jmp(L_fold_128_B_register);
8136
8137 // at this section of the code, there is 128 * x + y(0 <= y<128) bytes of buffer.The fold_128_B_loop
8138 // loop will fold 128B at a time until we have 128 + y Bytes of buffer
8139
8140 // fold 128B at a time.This section of the code folds 8 xmm registers in parallel
8141 bind(L_fold_128_B_loop);
8142 addl(pos, 128);
8143 fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
8144 fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
8145
8146 subl(len, 128);
8147 jcc(Assembler::greaterEqual, L_fold_128_B_loop);
8148
8149 addl(pos, 128);
8150
8151 // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
8152 // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
8153 bind(L_fold_128_B_register);
8154 evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
8155 evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
8156 evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
8157 evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
8158 // save last that has no multiplicand
8159 vextracti64x2(xmm7, xmm4, 3);
8160
8161 evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
8162 evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
8163 // Needed later in reduction loop
8164 movdqu(xmm10, Address(table, 1 * 16));
8165 vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
8166 vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
8167
8168 // Swap 1,0,3,2 - 01 00 11 10
8169 evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
8170 evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
8171 vextracti128(xmm5, xmm8, 1);
8172 evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
8173
8174 // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
8175 // instead of a cmp instruction, we use the negative flag with the jl instruction
8176 addl(len, 128 - 16);
8177 jcc(Assembler::less, L_final_reduction_for_128);
8178
8179 bind(L_16B_reduction_loop);
8180 vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8181 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8182 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8183 movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
8184 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8185 addl(pos, 16);
8186 subl(len, 16);
8187 jcc(Assembler::greaterEqual, L_16B_reduction_loop);
8188
8189 bind(L_final_reduction_for_128);
8190 addl(len, 16);
8191 jcc(Assembler::equal, L_128_done);
8192
8193 bind(L_get_last_two_xmms);
8194 movdqu(xmm2, xmm7);
8195 addl(pos, len);
8196 movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
8197 subl(pos, len);
8198
8199 // get rid of the extra data that was loaded before
8200 // load the shift constant
8201 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
8202 movdqu(xmm0, Address(rax, len));
8203 addl(rax, len);
8204
8205 vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8206 //Change mask to 512
8207 vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
8208 vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
8209
8210 blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
8211 vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8212 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8213 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8214 vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
8215
8216 bind(L_128_done);
8217 // compute crc of a 128-bit value
8218 movdqu(xmm10, Address(table, 3 * 16));
8219 movdqu(xmm0, xmm7);
8220
8221 // 64b fold
8222 vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
8223 vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
8224 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8225
8226 // 32b fold
8227 movdqu(xmm0, xmm7);
8228 vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
8229 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8230 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8231 jmp(L_barrett);
8232
8233 bind(L_less_than_256);
8234 kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
8235
8236 //barrett reduction
8237 bind(L_barrett);
8238 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
8239 movdqu(xmm1, xmm7);
8240 movdqu(xmm2, xmm7);
8241 movdqu(xmm10, Address(table, 4 * 16));
8242
8243 pclmulqdq(xmm7, xmm10, 0x0);
8244 pxor(xmm7, xmm2);
8245 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
8246 movdqu(xmm2, xmm7);
8247 pclmulqdq(xmm7, xmm10, 0x10);
8248 pxor(xmm7, xmm2);
8249 pxor(xmm7, xmm1);
8250 pextrd(crc, xmm7, 2);
8251
8252 bind(L_cleanup);
8253 addptr(rsp, 16 * 2 + 8);
8254 pop(r12);
8255 }
8256
// S. Gueron / Information Processing Letters 112 (2012) 184
// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
// Output: the 64-bit carry-less product of B * CONST
//
// The emitted code looks up each of the four bytes of B in the n-th 256-entry
// sub-table (8 bytes per entry) that starts at StubRoutines::crc32c_table_addr(),
// shifts the four entries left by 0, 8, 16 and 24 bits and XORs them together.
//
//   in   - holds B on entry; overwritten with the combined result.
//   n    - sub-table selector; the table base is advanced by n * 256 * 8 bytes.
//   tmp1 - scratch; accumulates Q1 ^ (Q2 << 8) ^ (Q3 << 16).
//   tmp2 - scratch; holds Q2, then Q3.
//   tmp3 - scratch; holds the address of the selected sub-table.
void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
                                     Register tmp1, Register tmp2, Register tmp3) {
  // tmp3 = &TABLEExt[n][0]
  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
  if (n > 0) {
    addq(tmp3, n * 256 * 8);
  }
  // Q1 = TABLEExt[n][B & 0xFF];
  movl(tmp1, in);
  andl(tmp1, 0x000000FF);
  shll(tmp1, 3);                  // byte value -> offset of an 8-byte entry
  addq(tmp1, tmp3);
  movq(tmp1, Address(tmp1, 0));

  // Q2 = TABLEExt[n][B >> 8 & 0xFF];
  movl(tmp2, in);
  shrl(tmp2, 8);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addq(tmp2, tmp3);
  movq(tmp2, Address(tmp2, 0));

  // tmp1 = Q1 ^ (Q2 << 8)
  shlq(tmp2, 8);
  xorq(tmp1, tmp2);

  // Q3 = TABLEExt[n][B >> 16 & 0xFF];
  movl(tmp2, in);
  shrl(tmp2, 16);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addq(tmp2, tmp3);
  movq(tmp2, Address(tmp2, 0));

  // tmp1 ^= Q3 << 16
  shlq(tmp2, 16);
  xorq(tmp1, tmp2);

  // Q4 = TABLEExt[n][B >> 24 & 0xFF];
  // 'in' is no longer needed as an input, so it is consumed in place here.
  shrl(in, 24);
  andl(in, 0x000000FF);
  shll(in, 3);
  addq(in, tmp3);
  movq(in, Address(in, 0));

  shlq(in, 24);
  xorq(in, tmp1);
  // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}
8307
8308 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
8309 Register in_out,
8310 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
8311 XMMRegister w_xtmp2,
8312 Register tmp1,
8313 Register n_tmp2, Register n_tmp3) {
8314 if (is_pclmulqdq_supported) {
8315 movdl(w_xtmp1, in_out); // modified blindly
8316
8317 movl(tmp1, const_or_pre_comp_const_index);
8318 movdl(w_xtmp2, tmp1);
8319 pclmulqdq(w_xtmp1, w_xtmp2, 0);
8320
8321 movdq(in_out, w_xtmp1);
8322 } else {
8323 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
8324 }
8325 }
8326
// Recombination Alternative 2: No bit-reflections
// T1 = (CRC_A * U1) << 1
// T2 = (CRC_B * U2) << 1
// C1 = T1 >> 32
// C2 = T2 >> 32
// T1 = T1 & 0xFFFFFFFF
// T2 = T2 & 0xFFFFFFFF
// T1 = CRC32(0, T1)
// T2 = CRC32(0, T2)
// C1 = C1 ^ T1
// C2 = C2 ^ T2
// CRC = C1 ^ C2 ^ CRC_C
//
// Register roles in the emitted code:
//   in_out  - CRC_A on entry, the recombined CRC on exit.
//   in1     - CRC_B on entry; clobbered.
//   in2     - CRC_C; only read.
//   w_xtmp1, w_xtmp2, w_xtmp3 - XMM scratch handed to crc32c_pclmulqdq().
//   tmp1, tmp2, n_tmp3        - general purpose scratch; tmp1/tmp2 are also
//                               used here for the T and CRC32(0, T) values.
// The two multiplier arguments and is_pclmulqdq_supported are forwarded to
// crc32c_pclmulqdq().
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                     Register tmp1, Register tmp2,
                                     Register n_tmp3) {
  // in_out = CRC_A * U1, in1 = CRC_B * U2
  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
  // T1 <<= 1; tmp1 = low 32 bits of T1; in_out = C1 = T1 >> 32
  shlq(in_out, 1);
  movl(tmp1, in_out);
  shrq(in_out, 32);
  // tmp2 = CRC32(0, T1 & 0xFFFFFFFF)
  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);
  xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
  // Same steps for T2 / C2, using in1.
  shlq(in1, 1);
  movl(tmp1, in1);
  shrq(in1, 32);
  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);
  xorl(in1, tmp2);
  // CRC = C1 ^ C2 ^ CRC_C
  xorl(in_out, in1);
  xorl(in_out, in2);
}
8360
// Set N to predefined value
// Subtract from a length of a buffer
// execute in a loop:
// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
// for i = 1 to N do
//  CRC_A = CRC32(CRC_A, A[i])
//  CRC_B = CRC32(CRC_B, B[i])
//  CRC_C = CRC32(CRC_C, C[i])
// end for
// Recombine
//
// Emits a loop that, while at least 3 * size bytes remain, runs three CRC32
// streams over three consecutive partitions of 'size' bytes each and then
// merges them with crc32c_rec_alt2().
//
//   size     - partition size in bytes; the inner loop advances 8 bytes per
//              iteration.
//   in_out1  - remaining byte count; reduced by 3 * size per round.
//   in_out2  - current buffer position; advanced by 3 * size per round.
//   in_out3  - running CRC; accumulates the first partition and receives the
//              recombined value.
//   tmp1, tmp2 - CRC accumulators of the second and third partition
//                (zeroed at the start of every round).
//   tmp3     - end address of the first partition.
// The remaining arguments are forwarded to crc32c_rec_alt2().
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
                                       Register in_out1, Register in_out2, Register in_out3,
                                       Register tmp1, Register tmp2, Register tmp3,
                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                       Register tmp4, Register tmp5,
                                       Register n_tmp6) {
  Label L_processPartitions;
  Label L_processPartition;
  Label L_exit;

  bind(L_processPartitions);
  // Leave once fewer than three full partitions remain.
  cmpl(in_out1, 3 * size);
  jcc(Assembler::less, L_exit);
  xorl(tmp1, tmp1);
  xorl(tmp2, tmp2);
  // tmp3 = end of the first partition
  movq(tmp3, in_out2);
  addq(tmp3, size);

  bind(L_processPartition);
  // One quadword from each of the three partitions per iteration.
  crc32(in_out3, Address(in_out2, 0), 8);
  crc32(tmp1, Address(in_out2, size), 8);
  crc32(tmp2, Address(in_out2, size * 2), 8);
  addq(in_out2, 8);
  cmpq(in_out2, tmp3);
  jcc(Assembler::less, L_processPartition);
  // Merge the three partial CRCs into in_out3.
  crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
                  w_xtmp1, w_xtmp2, w_xtmp3,
                  tmp4, tmp5,
                  n_tmp6);
  // in_out2 already moved past the first partition; skip the other two.
  addq(in_out2, 2 * size);
  subl(in_out1, 3 * size);
  jmp(L_processPartitions);

  bind(L_exit);
}
8406
// Algorithm 2: Pipelined usage of the CRC32 instruction.
// Input: A buffer I of L bytes.
// Output: the CRC32C value of the buffer.
// Notations:
// Write L = 24N + r, with N = floor (L/24).
// r = L mod 24 (0 <= r < 24).
// Consider I as the concatenation of A|B|C|R, where A, B, C, each,
// N quadwords, and R consists of r bytes.
// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
// B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
// C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
// if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
//
// Register roles in the emitted code:
//   in_out - running CRC (input and result).
//   in1    - current buffer position; advanced as data is consumed.
//   in2    - remaining length in bytes; reduced to (length & 7) before the
//            byte-wise loop.
//   tmp1..tmp6, w_xtmp1..w_xtmp3 - scratch.
// The buffer is first processed in three-way partitioned chunks of sizes
// CRC32C_HIGH, CRC32C_MIDDLE and CRC32C_LOW (see crc32c_proc_chunk()), then
// 8 bytes at a time, then byte by byte.
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
                                          Register tmp1, Register tmp2, Register tmp3,
                                          Register tmp4, Register tmp5, Register tmp6,
                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                          bool is_pclmulqdq_supported) {
  // Either the multiplication constants themselves (PCLMULQDQ path) or the
  // sub-table indices that crc32c_ipl_alg4() uses (table lookup path).
  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
  Label L_wordByWord;
  Label L_byteByByteProlog;
  Label L_byteByByte;
  Label L_exit;

  if (is_pclmulqdq_supported ) {
    // Read the constants from the table now, at code generation time. Within
    // each pair the two table words are stored in swapped order.
    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::crc32c_table_addr();
    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 1);

    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 2);
    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 3);

    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 4);
    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 5);
    assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
  } else {
    // Sub-table indices, swapped within each pair in the same way.
    const_or_pre_comp_const_index[0] = 1;
    const_or_pre_comp_const_index[1] = 0;

    const_or_pre_comp_const_index[2] = 3;
    const_or_pre_comp_const_index[3] = 2;

    const_or_pre_comp_const_index[4] = 5;
    const_or_pre_comp_const_index[5] = 4;
  }
  // Three passes with decreasing partition size; each pass consumes as many
  // 3 * size byte rounds as still fit into the remaining length (in2).
  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  // tmp1 = in1 + (in2 - (in2 & 7)): end address of the remaining whole quadwords.
  movl(tmp1, in2);
  andl(tmp1, 0x00000007);
  negl(tmp1);
  addl(tmp1, in2);
  addq(tmp1, in1);

  // Skip the quadword loop when no whole quadword is left.
  cmpq(in1, tmp1);
  jccb(Assembler::greaterEqual, L_byteByByteProlog);
  align(16);
  BIND(L_wordByWord);
  crc32(in_out, Address(in1, 0), 8);
  addq(in1, 8);
  cmpq(in1, tmp1);
  jcc(Assembler::less, L_wordByWord);

  BIND(L_byteByByteProlog);
  // in2 = number of trailing bytes (0..7); tmp2 counts from 1 up to in2.
  andl(in2, 0x00000007);
  movl(tmp2, 1);

  cmpl(tmp2, in2);
  jccb(Assembler::greater, L_exit);
  BIND(L_byteByByte);
  crc32(in_out, Address(in1, 0), 1);
  incq(in1);
  incl(tmp2);
  cmpl(tmp2, in2);
  jcc(Assembler::lessEqual, L_byteByByte);

  BIND(L_exit);
}
8498 #undef BIND
8499 #undef BLOCK_COMMENT
8500
// Compress char[] array to byte[].
// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
// Return the array length if every element in array can be encoded,
// otherwise, the index of first non-latin1 (> 0xff) character.
// @IntrinsicCandidate
// public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
//   for (int i = 0; i < len; i++) {
//     char c = src[srcOff];
//     if (c > 0xff) {
//       return i; // return index of non-latin1 char
//     }
//     dst[dstOff] = (byte)c;
//     srcOff++;
//     dstOff++;
//   }
//   return len;
// }
//
// Register roles in the emitted code:
//   src, dst         - current source char / destination byte addresses; both
//                      are modified.
//   len              - element count on entry; reused as (negative) loop index
//                      and as scratch.
//   tmp1Reg..tmp4Reg - vector scratch.
//   tmp5             - general purpose scratch.
//   result           - receives the return value; must be a different
//                      register than len.
//   mask1, mask2     - opmask scratch, only used on the AVX-512 path.
// Three code paths are generated depending on CPU features: an AVX-512 path
// (32 chars per iteration), an SSE4.2 path (16 chars per iteration plus one
// optional block of 8) and a scalar loop that handles tails and locates the
// first non-latin1 char.
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         XMMRegister tmp1Reg, XMMRegister tmp2Reg,
                                         XMMRegister tmp3Reg, XMMRegister tmp4Reg,
                                         Register tmp5, Register result, KRegister mask1, KRegister mask2) {
  Label copy_chars_loop, done, reset_sp, copy_tail;

  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result

  // rsi holds start addr of source char[] to be compressed
  // rdi holds start addr of destination byte[]
  // rdx holds length

  assert(len != result, "");

  // save length for return
  movl(result, len);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_loop_tail, below_threshold, reset_for_copy_tail;

    // alignment
    Label post_alignment;

    // if length of the string is less than 32, handle it the old fashioned way
    testl(len, -32);
    jcc(Assembler::zero, below_threshold);

    // First check whether a character is compressible ( <= 0xFF).
    // Create mask to test for Unicode chars inside zmm vector
    movl(tmp5, 0x00FF);
    evpbroadcastw(tmp2Reg, tmp5, Assembler::AVX_512bit);

    // Skip the alignment step for lengths below 64.
    testl(len, -64);
    jccb(Assembler::zero, post_alignment);

    // tmp5 = (-dst) & 31: number of elements to process until dst is
    // 32-byte aligned.
    movl(tmp5, dst);
    andl(tmp5, (32 - 1));
    negl(tmp5);
    andl(tmp5, (32 - 1));

    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jccb(Assembler::zero, post_alignment);

    // ~(~0 << len), where len is the # of remaining elements to process
    // (len is borrowed as scratch here and restored from result afterwards)
    movl(len, 0xFFFFFFFF);
    shlxl(len, len, tmp5);
    notl(len);
    kmovdl(mask2, len);
    movl(len, result);

    // Masked load and compare of the leading elements; if any selected char
    // is not <= 0xFF, let the scalar loop process the whole input.
    evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
    evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
    ktestd(mask1, mask2);
    jcc(Assembler::carryClear, copy_tail);

    evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);

    // Advance past the leading elements: tmp5 chars = 2 * tmp5 source bytes.
    addptr(src, tmp5);
    addptr(src, tmp5);
    addptr(dst, tmp5);
    subl(len, tmp5);

    bind(post_alignment);
    // end of alignment

    movl(tmp5, len);
    andl(tmp5, (32 - 1)); // tail count (in chars)
    andl(len, ~(32 - 1)); // vector count (in chars)
    jccb(Assembler::zero, copy_loop_tail);

    // Point src/dst at the end of the vectorized region and let len run from
    // -count up to zero.
    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

    bind(copy_32_loop);
    evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
    evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    // Leave the loop unless all 32 compare results are set.
    kortestdl(mask1, mask1);
    jccb(Assembler::carryClear, reset_for_copy_tail);

    // All elements in current processed chunk are valid candidates for
    // compression. Write a truncated byte elements to the memory.
    evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
    addptr(len, 32);
    jccb(Assembler::notZero, copy_32_loop);

    bind(copy_loop_tail);
    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jcc(Assembler::zero, done);

    movl(len, tmp5);

    // ~(~0 << len), where len is the # of remaining elements to process
    movl(tmp5, 0xFFFFFFFF);
    shlxl(tmp5, tmp5, len);
    notl(tmp5);

    kmovdl(mask2, tmp5);

    // Masked load and compare of the tail; on failure the scalar loop
    // re-processes the tail (len holds the tail count).
    evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
    evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
    ktestd(mask1, mask2);
    jcc(Assembler::carryClear, copy_tail);

    evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
    jmp(done);

    // A non-compressible char was seen in the 32-char loop. Extend src/dst
    // and the negative index by the tail count so that the scalar loop
    // covers everything from the current position to the end.
    bind(reset_for_copy_tail);
    lea(src, Address(src, tmp5, Address::times_2));
    lea(dst, Address(dst, tmp5, Address::times_1));
    subptr(len, tmp5);
    jmp(copy_chars_loop);

    bind(below_threshold);
  }

  if (UseSSE42Intrinsics) {
    Label copy_32_loop, copy_16, copy_tail_sse, reset_for_copy_tail;

    // vectored compression
    // fewer than 8 chars: go straight to the scalar loop
    testl(len, 0xfffffff8);
    jcc(Assembler::zero, copy_tail);

    movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
    movdl(tmp1Reg, tmp5);
    pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg

    // len = number of chars handled by the 16-chars-per-iteration loop
    andl(len, 0xfffffff0);
    jccb(Assembler::zero, copy_16);

    // compress 16 chars per iter
    pxor(tmp4Reg, tmp4Reg);

    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

    bind(copy_32_loop);
    movdqu(tmp2Reg, Address(src, len, Address::times_2)); // load 1st 8 characters
    por(tmp4Reg, tmp2Reg);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
    por(tmp4Reg, tmp3Reg);
    ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector
    jccb(Assembler::notZero, reset_for_copy_tail);
    packuswb(tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte
    movdqu(Address(dst, len, Address::times_1), tmp2Reg);
    addptr(len, 16);
    jccb(Assembler::notZero, copy_32_loop);

    // compress next vector of 8 chars (if any)
    bind(copy_16);
    // len = 0
    testl(result, 0x00000008); // check if there's a block of 8 chars to compress
    jccb(Assembler::zero, copy_tail_sse);

    pxor(tmp3Reg, tmp3Reg);

    movdqu(tmp2Reg, Address(src, 0));
    ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
    jccb(Assembler::notZero, reset_for_copy_tail);
    packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte
    movq(Address(dst, 0), tmp2Reg);
    addptr(src, 16);
    addptr(dst, 8);
    jmpb(copy_tail_sse);

    // A non-compressible char was seen. Extend src/dst and the index by the
    // chars not covered by the 16-char loop (result & 15) so that the scalar
    // loop covers everything from the current position to the end.
    bind(reset_for_copy_tail);
    movl(tmp5, result);
    andl(tmp5, 0x0000000f);
    lea(src, Address(src, tmp5, Address::times_2));
    lea(dst, Address(dst, tmp5, Address::times_1));
    subptr(len, tmp5);
    jmpb(copy_chars_loop);

    bind(copy_tail_sse);
    movl(len, result);
    andl(len, 0x00000007); // tail count (in chars)
  }
  // compress 1 char per iter
  bind(copy_tail);
  testl(len, len);
  jccb(Assembler::zero, done);
  // src/dst point at the end of the data; len runs from -count up to zero.
  lea(src, Address(src, len, Address::times_2));
  lea(dst, Address(dst, len, Address::times_1));
  negptr(len);

  bind(copy_chars_loop);
  load_unsigned_short(tmp5, Address(src, len, Address::times_2));
  testl(tmp5, 0xff00); // check if Unicode char
  jccb(Assembler::notZero, reset_sp);
  movb(Address(dst, len, Address::times_1), tmp5); // ASCII char; compress to 1 byte
  increment(len);
  jccb(Assembler::notZero, copy_chars_loop);

  // add len then return (len will be zero if compress succeeded, otherwise negative)
  bind(reset_sp);
  addl(result, len);

  bind(done);
}
8727
// Inflate byte[] array to char[].
// See java.base/share/classes/java/lang/StringLatin1.java
// @IntrinsicCandidate
// private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
//   for (int i = 0; i < len; i++) {
//     dst[dstOff++] = (char)(src[srcOff++] & 0xff);
//   }
// }
//
// Register roles in the emitted code:
//   src, dst - current source byte / destination char addresses; both are
//              modified.
//   len      - element count on entry; reused as (negative) loop index and as
//              scratch.
//   tmp1     - vector scratch.
//   tmp2     - general purpose scratch (tail count, loaded byte).
//   mask     - opmask scratch, only used on the AVX-512 path.
// Depending on CPU features, an AVX-512 path (32 elements per iteration), an
// AVX2/SSE4.2 path (16 or 8 elements per iteration) and a scalar loop are
// generated.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        XMMRegister tmp1, Register tmp2, KRegister mask) {
  Label copy_chars_loop, done, below_threshold, avx3_threshold;
  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp2

  // rsi holds start addr of source byte[] to be inflated
  // rdi holds start addr of destination char[]
  // rdx holds length
  assert_different_registers(src, dst, len, tmp2);
  // Keep a copy of the length; the tail counts are derived from it below.
  movl(tmp2, len);
  if ((UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_tail;
    Register tmp3_aliased = len;

    // if length of the string is less than 16, handle it in an old fashioned way
    testl(len, -16);
    jcc(Assembler::zero, below_threshold);

    // Use the non-AVX-512 code below when no bit at or above AVX3Threshold is
    // set in len (presumably AVX3Threshold is a power of two or zero -
    // TODO confirm).
    testl(len, -1 * AVX3Threshold);
    jcc(Assembler::zero, avx3_threshold);

    // In order to use only one arithmetic operation for the main loop we use
    // this pre-calculation
    andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
    andl(len, -32); // vector count
    jccb(Assembler::zero, copy_tail);

    // Point src/dst at the end of the vectorized region and let len run from
    // -count up to zero.
    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);


    // inflate 32 chars per iter
    bind(copy_32_loop);
    vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
    evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
    addptr(len, 32);
    jcc(Assembler::notZero, copy_32_loop);

    bind(copy_tail);
    // bail out when there is nothing to be done
    testl(tmp2, -1); // we don't destroy the contents of tmp2 here
    jcc(Assembler::zero, done);

    // ~(~0 << length), where length is the # of remaining elements to process
    movl(tmp3_aliased, -1);
    shlxl(tmp3_aliased, tmp3_aliased, tmp2);
    notl(tmp3_aliased);
    kmovdl(mask, tmp3_aliased);
    // Masked load/zero-extend/store of the remaining elements.
    evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
    evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);

    jmp(done);
    bind(avx3_threshold);
  }
  if (UseSSE42Intrinsics) {
    Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;

    if (UseAVX > 1) {
      // 16 elements per iteration; tmp2 keeps the remainder (0..15)
      andl(tmp2, (16 - 1));
      andl(len, -16);
      jccb(Assembler::zero, copy_new_tail);
    } else {
      andl(tmp2, 0x00000007); // tail count (in chars)
      andl(len, 0xfffffff8); // vector count (in chars)
      jccb(Assembler::zero, copy_tail);
    }

    // vectored inflation
    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);

    if (UseAVX > 1) {
      bind(copy_16_loop);
      vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
      vmovdqu(Address(dst, len, Address::times_2), tmp1);
      addptr(len, 16);
      jcc(Assembler::notZero, copy_16_loop);

      // Also the entry point for inputs shorter than 16 coming from the
      // AVX-512 check above; tmp2 still holds the full length in that case.
      bind(below_threshold);
      bind(copy_new_tail);
      // Split the remainder into one optional block of 8 and a tail of 0..7.
      movl(len, tmp2);
      andl(tmp2, 0x00000007);
      andl(len, 0xFFFFFFF8);
      jccb(Assembler::zero, copy_tail);

      pmovzxbw(tmp1, Address(src, 0));
      movdqu(Address(dst, 0), tmp1);
      addptr(src, 8);
      addptr(dst, 2 * 8);

      jmp(copy_tail, true);
    }

    // inflate 8 chars per iter
    bind(copy_8_loop);
    pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
    movdqu(Address(dst, len, Address::times_2), tmp1);
    addptr(len, 8);
    jcc(Assembler::notZero, copy_8_loop);

    bind(copy_tail);
    // len = remaining element count (0..7)
    movl(len, tmp2);

    cmpl(len, 4);
    jccb(Assembler::less, copy_bytes);

    movdl(tmp1, Address(src, 0)); // load 4 byte chars
    pmovzxbw(tmp1, tmp1);
    movq(Address(dst, 0), tmp1);
    subptr(len, 4);
    addptr(src, 4);
    addptr(dst, 8);

    bind(copy_bytes);
  } else {
    bind(below_threshold);
  }

  // Scalar loop over whatever is left in len.
  testl(len, len);
  jccb(Assembler::zero, done);
  lea(src, Address(src, len, Address::times_1));
  lea(dst, Address(dst, len, Address::times_2));
  negptr(len);

  // inflate 1 char per iter
  bind(copy_chars_loop);
  load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char
  movw(Address(dst, len, Address::times_2), tmp2); // inflate byte char to word
  increment(len);
  jcc(Assembler::notZero, copy_chars_loop);

  bind(done);
}
8877
8878 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
8879 switch(type) {
8880 case T_BYTE:
8881 case T_BOOLEAN:
8882 evmovdqub(dst, kmask, src, merge, vector_len);
8883 break;
8884 case T_CHAR:
8885 case T_SHORT:
8886 evmovdquw(dst, kmask, src, merge, vector_len);
8887 break;
8888 case T_INT:
8889 case T_FLOAT:
8890 evmovdqul(dst, kmask, src, merge, vector_len);
8891 break;
8892 case T_LONG:
8893 case T_DOUBLE:
8894 evmovdquq(dst, kmask, src, merge, vector_len);
8895 break;
8896 default:
8897 fatal("Unexpected type argument %s", type2name(type));
8898 break;
8899 }
8900 }
8901
8902
8903 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
8904 switch(type) {
8905 case T_BYTE:
8906 case T_BOOLEAN:
8907 evmovdqub(dst, kmask, src, merge, vector_len);
8908 break;
8909 case T_CHAR:
8910 case T_SHORT:
8911 evmovdquw(dst, kmask, src, merge, vector_len);
8912 break;
8913 case T_INT:
8914 case T_FLOAT:
8915 evmovdqul(dst, kmask, src, merge, vector_len);
8916 break;
8917 case T_LONG:
8918 case T_DOUBLE:
8919 evmovdquq(dst, kmask, src, merge, vector_len);
8920 break;
8921 default:
8922 fatal("Unexpected type argument %s", type2name(type));
8923 break;
8924 }
8925 }
8926
8927 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
8928 switch(type) {
8929 case T_BYTE:
8930 case T_BOOLEAN:
8931 evmovdqub(dst, kmask, src, merge, vector_len);
8932 break;
8933 case T_CHAR:
8934 case T_SHORT:
8935 evmovdquw(dst, kmask, src, merge, vector_len);
8936 break;
8937 case T_INT:
8938 case T_FLOAT:
8939 evmovdqul(dst, kmask, src, merge, vector_len);
8940 break;
8941 case T_LONG:
8942 case T_DOUBLE:
8943 evmovdquq(dst, kmask, src, merge, vector_len);
8944 break;
8945 default:
8946 fatal("Unexpected type argument %s", type2name(type));
8947 break;
8948 }
8949 }
8950
8951 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
8952 switch(masklen) {
8953 case 2:
8954 knotbl(dst, src);
8955 movl(rtmp, 3);
8956 kmovbl(ktmp, rtmp);
8957 kandbl(dst, ktmp, dst);
8958 break;
8959 case 4:
8960 knotbl(dst, src);
8961 movl(rtmp, 15);
8962 kmovbl(ktmp, rtmp);
8963 kandbl(dst, ktmp, dst);
8964 break;
8965 case 8:
8966 knotbl(dst, src);
8967 break;
8968 case 16:
8969 knotwl(dst, src);
8970 break;
8971 case 32:
8972 knotdl(dst, src);
8973 break;
8974 case 64:
8975 knotql(dst, src);
8976 break;
8977 default:
8978 fatal("Unexpected vector length %d", masklen);
8979 break;
8980 }
8981 }
8982
8983 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8984 switch(type) {
8985 case T_BOOLEAN:
8986 case T_BYTE:
8987 kandbl(dst, src1, src2);
8988 break;
8989 case T_CHAR:
8990 case T_SHORT:
8991 kandwl(dst, src1, src2);
8992 break;
8993 case T_INT:
8994 case T_FLOAT:
8995 kanddl(dst, src1, src2);
8996 break;
8997 case T_LONG:
8998 case T_DOUBLE:
8999 kandql(dst, src1, src2);
9000 break;
9001 default:
9002 fatal("Unexpected type argument %s", type2name(type));
9003 break;
9004 }
9005 }
9006
9007 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9008 switch(type) {
9009 case T_BOOLEAN:
9010 case T_BYTE:
9011 korbl(dst, src1, src2);
9012 break;
9013 case T_CHAR:
9014 case T_SHORT:
9015 korwl(dst, src1, src2);
9016 break;
9017 case T_INT:
9018 case T_FLOAT:
9019 kordl(dst, src1, src2);
9020 break;
9021 case T_LONG:
9022 case T_DOUBLE:
9023 korql(dst, src1, src2);
9024 break;
9025 default:
9026 fatal("Unexpected type argument %s", type2name(type));
9027 break;
9028 }
9029 }
9030
9031 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9032 switch(type) {
9033 case T_BOOLEAN:
9034 case T_BYTE:
9035 kxorbl(dst, src1, src2);
9036 break;
9037 case T_CHAR:
9038 case T_SHORT:
9039 kxorwl(dst, src1, src2);
9040 break;
9041 case T_INT:
9042 case T_FLOAT:
9043 kxordl(dst, src1, src2);
9044 break;
9045 case T_LONG:
9046 case T_DOUBLE:
9047 kxorql(dst, src1, src2);
9048 break;
9049 default:
9050 fatal("Unexpected type argument %s", type2name(type));
9051 break;
9052 }
9053 }
9054
9055 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9056 switch(type) {
9057 case T_BOOLEAN:
9058 case T_BYTE:
9059 evpermb(dst, mask, nds, src, merge, vector_len); break;
9060 case T_CHAR:
9061 case T_SHORT:
9062 evpermw(dst, mask, nds, src, merge, vector_len); break;
9063 case T_INT:
9064 case T_FLOAT:
9065 evpermd(dst, mask, nds, src, merge, vector_len); break;
9066 case T_LONG:
9067 case T_DOUBLE:
9068 evpermq(dst, mask, nds, src, merge, vector_len); break;
9069 default:
9070 fatal("Unexpected type argument %s", type2name(type)); break;
9071 }
9072 }
9073
9074 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9075 switch(type) {
9076 case T_BOOLEAN:
9077 case T_BYTE:
9078 evpermb(dst, mask, nds, src, merge, vector_len); break;
9079 case T_CHAR:
9080 case T_SHORT:
9081 evpermw(dst, mask, nds, src, merge, vector_len); break;
9082 case T_INT:
9083 case T_FLOAT:
9084 evpermd(dst, mask, nds, src, merge, vector_len); break;
9085 case T_LONG:
9086 case T_DOUBLE:
9087 evpermq(dst, mask, nds, src, merge, vector_len); break;
9088 default:
9089 fatal("Unexpected type argument %s", type2name(type)); break;
9090 }
9091 }
9092
9093 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9094 switch(type) {
9095 case T_BYTE:
9096 evpminub(dst, mask, nds, src, merge, vector_len); break;
9097 case T_SHORT:
9098 evpminuw(dst, mask, nds, src, merge, vector_len); break;
9099 case T_INT:
9100 evpminud(dst, mask, nds, src, merge, vector_len); break;
9101 case T_LONG:
9102 evpminuq(dst, mask, nds, src, merge, vector_len); break;
9103 default:
9104 fatal("Unexpected type argument %s", type2name(type)); break;
9105 }
9106 }
9107
9108 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9109 switch(type) {
9110 case T_BYTE:
9111 evpmaxub(dst, mask, nds, src, merge, vector_len); break;
9112 case T_SHORT:
9113 evpmaxuw(dst, mask, nds, src, merge, vector_len); break;
9114 case T_INT:
9115 evpmaxud(dst, mask, nds, src, merge, vector_len); break;
9116 case T_LONG:
9117 evpmaxuq(dst, mask, nds, src, merge, vector_len); break;
9118 default:
9119 fatal("Unexpected type argument %s", type2name(type)); break;
9120 }
9121 }
9122
9123 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9124 switch(type) {
9125 case T_BYTE:
9126 evpminub(dst, mask, nds, src, merge, vector_len); break;
9127 case T_SHORT:
9128 evpminuw(dst, mask, nds, src, merge, vector_len); break;
9129 case T_INT:
9130 evpminud(dst, mask, nds, src, merge, vector_len); break;
9131 case T_LONG:
9132 evpminuq(dst, mask, nds, src, merge, vector_len); break;
9133 default:
9134 fatal("Unexpected type argument %s", type2name(type)); break;
9135 }
9136 }
9137
9138 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9139 switch(type) {
9140 case T_BYTE:
9141 evpmaxub(dst, mask, nds, src, merge, vector_len); break;
9142 case T_SHORT:
9143 evpmaxuw(dst, mask, nds, src, merge, vector_len); break;
9144 case T_INT:
9145 evpmaxud(dst, mask, nds, src, merge, vector_len); break;
9146 case T_LONG:
9147 evpmaxuq(dst, mask, nds, src, merge, vector_len); break;
9148 default:
9149 fatal("Unexpected type argument %s", type2name(type)); break;
9150 }
9151 }
9152
9153 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9154 switch(type) {
9155 case T_BYTE:
9156 evpminsb(dst, mask, nds, src, merge, vector_len); break;
9157 case T_SHORT:
9158 evpminsw(dst, mask, nds, src, merge, vector_len); break;
9159 case T_INT:
9160 evpminsd(dst, mask, nds, src, merge, vector_len); break;
9161 case T_LONG:
9162 evpminsq(dst, mask, nds, src, merge, vector_len); break;
9163 case T_FLOAT:
9164 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9165 case T_DOUBLE:
9166 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9167 default:
9168 fatal("Unexpected type argument %s", type2name(type)); break;
9169 }
9170 }
9171
9172 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9173 switch(type) {
9174 case T_BYTE:
9175 evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9176 case T_SHORT:
9177 evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9178 case T_INT:
9179 evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9180 case T_LONG:
9181 evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9182 case T_FLOAT:
9183 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9184 case T_DOUBLE:
9185 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9186 default:
9187 fatal("Unexpected type argument %s", type2name(type)); break;
9188 }
9189 }
9190
9191 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9192 switch(type) {
9193 case T_BYTE:
9194 evpminsb(dst, mask, nds, src, merge, vector_len); break;
9195 case T_SHORT:
9196 evpminsw(dst, mask, nds, src, merge, vector_len); break;
9197 case T_INT:
9198 evpminsd(dst, mask, nds, src, merge, vector_len); break;
9199 case T_LONG:
9200 evpminsq(dst, mask, nds, src, merge, vector_len); break;
9201 case T_FLOAT:
9202 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9203 case T_DOUBLE:
9204 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
9205 default:
9206 fatal("Unexpected type argument %s", type2name(type)); break;
9207 }
9208 }
9209
9210 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9211 switch(type) {
9212 case T_BYTE:
9213 evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9214 case T_SHORT:
9215 evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9216 case T_INT:
9217 evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9218 case T_LONG:
9219 evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9220 case T_FLOAT:
9221 evminmaxps(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9222 case T_DOUBLE:
9223 evminmaxpd(dst, mask, nds, src, merge, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
9224 default:
9225 fatal("Unexpected type argument %s", type2name(type)); break;
9226 }
9227 }
9228
9229 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9230 switch(type) {
9231 case T_INT:
9232 evpxord(dst, mask, nds, src, merge, vector_len); break;
9233 case T_LONG:
9234 evpxorq(dst, mask, nds, src, merge, vector_len); break;
9235 default:
9236 fatal("Unexpected type argument %s", type2name(type)); break;
9237 }
9238 }
9239
9240 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9241 switch(type) {
9242 case T_INT:
9243 evpxord(dst, mask, nds, src, merge, vector_len); break;
9244 case T_LONG:
9245 evpxorq(dst, mask, nds, src, merge, vector_len); break;
9246 default:
9247 fatal("Unexpected type argument %s", type2name(type)); break;
9248 }
9249 }
9250
9251 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9252 switch(type) {
9253 case T_INT:
9254 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9255 case T_LONG:
9256 evporq(dst, mask, nds, src, merge, vector_len); break;
9257 default:
9258 fatal("Unexpected type argument %s", type2name(type)); break;
9259 }
9260 }
9261
9262 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9263 switch(type) {
9264 case T_INT:
9265 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9266 case T_LONG:
9267 evporq(dst, mask, nds, src, merge, vector_len); break;
9268 default:
9269 fatal("Unexpected type argument %s", type2name(type)); break;
9270 }
9271 }
9272
9273 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9274 switch(type) {
9275 case T_INT:
9276 evpandd(dst, mask, nds, src, merge, vector_len); break;
9277 case T_LONG:
9278 evpandq(dst, mask, nds, src, merge, vector_len); break;
9279 default:
9280 fatal("Unexpected type argument %s", type2name(type)); break;
9281 }
9282 }
9283
9284 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9285 switch(type) {
9286 case T_INT:
9287 evpandd(dst, mask, nds, src, merge, vector_len); break;
9288 case T_LONG:
9289 evpandq(dst, mask, nds, src, merge, vector_len); break;
9290 default:
9291 fatal("Unexpected type argument %s", type2name(type)); break;
9292 }
9293 }
9294
9295 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
9296 switch(masklen) {
9297 case 8:
9298 kortestbl(src1, src2);
9299 break;
9300 case 16:
9301 kortestwl(src1, src2);
9302 break;
9303 case 32:
9304 kortestdl(src1, src2);
9305 break;
9306 case 64:
9307 kortestql(src1, src2);
9308 break;
9309 default:
9310 fatal("Unexpected mask length %d", masklen);
9311 break;
9312 }
9313 }
9314
9315
9316 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
9317 switch(masklen) {
9318 case 8:
9319 ktestbl(src1, src2);
9320 break;
9321 case 16:
9322 ktestwl(src1, src2);
9323 break;
9324 case 32:
9325 ktestdl(src1, src2);
9326 break;
9327 case 64:
9328 ktestql(src1, src2);
9329 break;
9330 default:
9331 fatal("Unexpected mask length %d", masklen);
9332 break;
9333 }
9334 }
9335
9336 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9337 switch(type) {
9338 case T_INT:
9339 evprold(dst, mask, src, shift, merge, vlen_enc); break;
9340 case T_LONG:
9341 evprolq(dst, mask, src, shift, merge, vlen_enc); break;
9342 default:
9343 fatal("Unexpected type argument %s", type2name(type)); break;
9344 break;
9345 }
9346 }
9347
9348 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9349 switch(type) {
9350 case T_INT:
9351 evprord(dst, mask, src, shift, merge, vlen_enc); break;
9352 case T_LONG:
9353 evprorq(dst, mask, src, shift, merge, vlen_enc); break;
9354 default:
9355 fatal("Unexpected type argument %s", type2name(type)); break;
9356 }
9357 }
9358
9359 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9360 switch(type) {
9361 case T_INT:
9362 evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
9363 case T_LONG:
9364 evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
9365 default:
9366 fatal("Unexpected type argument %s", type2name(type)); break;
9367 }
9368 }
9369
9370 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9371 switch(type) {
9372 case T_INT:
9373 evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
9374 case T_LONG:
9375 evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
9376 default:
9377 fatal("Unexpected type argument %s", type2name(type)); break;
9378 }
9379 }
9380
9381 void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9382 assert(rscratch != noreg || always_reachable(src), "missing");
9383
9384 if (reachable(src)) {
9385 evpandq(dst, nds, as_Address(src), vector_len);
9386 } else {
9387 lea(rscratch, src);
9388 evpandq(dst, nds, Address(rscratch, 0), vector_len);
9389 }
9390 }
9391
9392 void MacroAssembler::evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
9393 assert(rscratch != noreg || always_reachable(src), "missing");
9394
9395 if (reachable(src)) {
9396 Assembler::evpaddq(dst, mask, nds, as_Address(src), merge, vector_len);
9397 } else {
9398 lea(rscratch, src);
9399 Assembler::evpaddq(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
9400 }
9401 }
9402
9403 void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9404 assert(rscratch != noreg || always_reachable(src), "missing");
9405
9406 if (reachable(src)) {
9407 evporq(dst, nds, as_Address(src), vector_len);
9408 } else {
9409 lea(rscratch, src);
9410 evporq(dst, nds, Address(rscratch, 0), vector_len);
9411 }
9412 }
9413
9414 void MacroAssembler::vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9415 assert(rscratch != noreg || always_reachable(src), "missing");
9416
9417 if (reachable(src)) {
9418 vpshufb(dst, nds, as_Address(src), vector_len);
9419 } else {
9420 lea(rscratch, src);
9421 vpshufb(dst, nds, Address(rscratch, 0), vector_len);
9422 }
9423 }
9424
9425 void MacroAssembler::vpor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9426 assert(rscratch != noreg || always_reachable(src), "missing");
9427
9428 if (reachable(src)) {
9429 Assembler::vpor(dst, nds, as_Address(src), vector_len);
9430 } else {
9431 lea(rscratch, src);
9432 Assembler::vpor(dst, nds, Address(rscratch, 0), vector_len);
9433 }
9434 }
9435
9436 void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) {
9437 assert(rscratch != noreg || always_reachable(src3), "missing");
9438
9439 if (reachable(src3)) {
9440 vpternlogq(dst, imm8, src2, as_Address(src3), vector_len);
9441 } else {
9442 lea(rscratch, src3);
9443 vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len);
9444 }
9445 }
9446
9447 #if COMPILER2_OR_JVMCI
9448
// Emits a predicated vector store of xmm to dst.
//
//  - bt:      element type passed to the masked store
//  - dst:     destination memory operand
//  - xmm:     vector register holding the fill pattern
//  - mask:    opmask register; overwritten with the computed store mask
//  - length:  register used as the bzhi index, i.e. the number of low mask bits kept
//  - temp:    scratch general purpose register; clobbered
//  - vec_enc: vector length encoding used for the store
void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
                                 Register length, Register temp, int vec_enc) {
  // Computing mask for predicated vector store.
  // Start from all ones and let bzhi clear the bits from position 'length' upwards.
  movptr(temp, -1);
  bzhiq(temp, temp, length);
  kmov(mask, temp);
  evmovdqu(bt, mask, dst, xmm, true, vec_enc);
}
9457
9458 // Set memory operation for length "less than" 64 bytes.
9459 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
9460 XMMRegister xmm, KRegister mask, Register length,
9461 Register temp, bool use64byteVector) {
9462 assert(MaxVectorSize >= 32, "vector length should be >= 32");
9463 const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9464 if (!use64byteVector) {
9465 fill32(dst, disp, xmm);
9466 subptr(length, 32 >> shift);
9467 fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
9468 } else {
9469 assert(MaxVectorSize == 64, "vector length != 64");
9470 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
9471 }
9472 }
9473
9474
9475 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
9476 XMMRegister xmm, KRegister mask, Register length,
9477 Register temp) {
9478 assert(MaxVectorSize >= 32, "vector length should be >= 32");
9479 const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9480 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
9481 }
9482
9483
// Emits an unmasked vmovdqu store of xmm to dst.
// Asserts that MaxVectorSize is at least 32.
void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  vmovdqu(dst, xmm);
}
9488
9489 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
9490 fill32(Address(dst, disp), xmm);
9491 }
9492
9493 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
9494 assert(MaxVectorSize >= 32, "vector length should be >= 32");
9495 if (!use64byteVector) {
9496 fill32(dst, xmm);
9497 fill32(dst.plus_disp(32), xmm);
9498 } else {
9499 evmovdquq(dst, xmm, Assembler::AVX_512bit);
9500 }
9501 }
9502
9503 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
9504 fill64(Address(dst, disp), xmm, use64byteVector);
9505 }
9506
9507 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
9508 Register count, Register rtmp, XMMRegister xtmp) {
9509 Label L_exit;
9510 Label L_fill_start;
9511 Label L_fill_64_bytes;
9512 Label L_fill_96_bytes;
9513 Label L_fill_128_bytes;
9514 Label L_fill_128_bytes_loop;
9515 Label L_fill_128_loop_header;
9516 Label L_fill_128_bytes_loop_header;
9517 Label L_fill_128_bytes_loop_pre_header;
9518 Label L_fill_zmm_sequence;
9519
9520 int shift = -1;
9521 switch(type) {
9522 case T_BYTE: shift = 0;
9523 break;
9524 case T_SHORT: shift = 1;
9525 break;
9526 case T_INT: shift = 2;
9527 break;
9528 /* Uncomment when LONG fill stubs are supported.
9529 case T_LONG: shift = 3;
9530 break;
9531 */
9532 default:
9533 fatal("Unhandled type: %s\n", type2name(type));
9534 }
9535
9536 if ((CopyAVX3Threshold != 0) || (MaxVectorSize == 32)) {
9537
9538 if (MaxVectorSize == 64) {
9539 cmpq(count, CopyAVX3Threshold >> shift);
9540 jcc(Assembler::greater, L_fill_zmm_sequence);
9541 }
9542
9543 evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
9544
9545 bind(L_fill_start);
9546
9547 cmpq(count, 32 >> shift);
9548 jccb(Assembler::greater, L_fill_64_bytes);
9549 fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
9550 jmp(L_exit);
9551
9552 bind(L_fill_64_bytes);
9553 cmpq(count, 64 >> shift);
9554 jccb(Assembler::greater, L_fill_96_bytes);
9555 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
9556 jmp(L_exit);
9557
9558 bind(L_fill_96_bytes);
9559 cmpq(count, 96 >> shift);
9560 jccb(Assembler::greater, L_fill_128_bytes);
9561 fill64(to, 0, xtmp);
9562 subq(count, 64 >> shift);
9563 fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
9564 jmp(L_exit);
9565
9566 bind(L_fill_128_bytes);
9567 cmpq(count, 128 >> shift);
9568 jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
9569 fill64(to, 0, xtmp);
9570 fill32(to, 64, xtmp);
9571 subq(count, 96 >> shift);
9572 fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
9573 jmp(L_exit);
9574
9575 bind(L_fill_128_bytes_loop_pre_header);
9576 {
9577 mov(rtmp, to);
9578 andq(rtmp, 31);
9579 jccb(Assembler::zero, L_fill_128_bytes_loop_header);
9580 negq(rtmp);
9581 addq(rtmp, 32);
9582 mov64(r8, -1L);
9583 bzhiq(r8, r8, rtmp);
9584 kmovql(k2, r8);
9585 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit);
9586 addq(to, rtmp);
9587 shrq(rtmp, shift);
9588 subq(count, rtmp);
9589 }
9590
9591 cmpq(count, 128 >> shift);
9592 jcc(Assembler::less, L_fill_start);
9593
9594 bind(L_fill_128_bytes_loop_header);
9595 subq(count, 128 >> shift);
9596
9597 align32();
9598 bind(L_fill_128_bytes_loop);
9599 fill64(to, 0, xtmp);
9600 fill64(to, 64, xtmp);
9601 addq(to, 128);
9602 subq(count, 128 >> shift);
9603 jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
9604
9605 addq(count, 128 >> shift);
9606 jcc(Assembler::zero, L_exit);
9607 jmp(L_fill_start);
9608 }
9609
9610 if (MaxVectorSize == 64) {
9611 // Sequence using 64 byte ZMM register.
9612 Label L_fill_128_bytes_zmm;
9613 Label L_fill_192_bytes_zmm;
9614 Label L_fill_192_bytes_loop_zmm;
9615 Label L_fill_192_bytes_loop_header_zmm;
9616 Label L_fill_192_bytes_loop_pre_header_zmm;
9617 Label L_fill_start_zmm_sequence;
9618
9619 bind(L_fill_zmm_sequence);
9620 evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
9621
9622 bind(L_fill_start_zmm_sequence);
9623 cmpq(count, 64 >> shift);
9624 jccb(Assembler::greater, L_fill_128_bytes_zmm);
9625 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
9626 jmp(L_exit);
9627
9628 bind(L_fill_128_bytes_zmm);
9629 cmpq(count, 128 >> shift);
9630 jccb(Assembler::greater, L_fill_192_bytes_zmm);
9631 fill64(to, 0, xtmp, true);
9632 subq(count, 64 >> shift);
9633 fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
9634 jmp(L_exit);
9635
9636 bind(L_fill_192_bytes_zmm);
9637 cmpq(count, 192 >> shift);
9638 jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
9639 fill64(to, 0, xtmp, true);
9640 fill64(to, 64, xtmp, true);
9641 subq(count, 128 >> shift);
9642 fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
9643 jmp(L_exit);
9644
9645 bind(L_fill_192_bytes_loop_pre_header_zmm);
9646 {
9647 movq(rtmp, to);
9648 andq(rtmp, 63);
9649 jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
9650 negq(rtmp);
9651 addq(rtmp, 64);
9652 mov64(r8, -1L);
9653 bzhiq(r8, r8, rtmp);
9654 kmovql(k2, r8);
9655 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit);
9656 addq(to, rtmp);
9657 shrq(rtmp, shift);
9658 subq(count, rtmp);
9659 }
9660
9661 cmpq(count, 192 >> shift);
9662 jcc(Assembler::less, L_fill_start_zmm_sequence);
9663
9664 bind(L_fill_192_bytes_loop_header_zmm);
9665 subq(count, 192 >> shift);
9666
9667 align32();
9668 bind(L_fill_192_bytes_loop_zmm);
9669 fill64(to, 0, xtmp, true);
9670 fill64(to, 64, xtmp, true);
9671 fill64(to, 128, xtmp, true);
9672 addq(to, 192);
9673 subq(count, 192 >> shift);
9674 jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
9675
9676 addq(count, 192 >> shift);
9677 jcc(Assembler::zero, L_exit);
9678 jmp(L_fill_start_zmm_sequence);
9679 }
9680 bind(L_exit);
9681 }
9682 #endif //COMPILER2_OR_JVMCI
9683
9684
9685 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
9686 Label done;
9687 cvttss2sil(dst, src);
9688 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9689 cmpl(dst, 0x80000000); // float_sign_flip
9690 jccb(Assembler::notEqual, done);
9691 subptr(rsp, 8);
9692 movflt(Address(rsp, 0), src);
9693 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
9694 pop(dst);
9695 bind(done);
9696 }
9697
9698 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
9699 Label done;
9700 cvttsd2sil(dst, src);
9701 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9702 cmpl(dst, 0x80000000); // float_sign_flip
9703 jccb(Assembler::notEqual, done);
9704 subptr(rsp, 8);
9705 movdbl(Address(rsp, 0), src);
9706 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
9707 pop(dst);
9708 bind(done);
9709 }
9710
9711 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
9712 Label done;
9713 cvttss2siq(dst, src);
9714 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9715 jccb(Assembler::notEqual, done);
9716 subptr(rsp, 8);
9717 movflt(Address(rsp, 0), src);
9718 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
9719 pop(dst);
9720 bind(done);
9721 }
9722
// Emits code that rounds the float in src to an int in dst.
//
//  - dst:  result register
//  - src:  XMM register holding the float input
//  - rtmp: scratch general purpose register
//  - rcx:  receives the computed shift amount.
//          NOTE(review): sarl(dst) below takes no explicit count, so this
//          parameter is presumably required to be the real rcx register
//          (shift by CL) — confirm against callers. The parameter name also
//          shadows the global register of the same name inside this function.
void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  // Following code is line by line assembly translation rounding algorithm.
  // Please refer to java.lang.Math.round(float) algorithm for details.
  const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
  const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
  const int32_t FloatConsts_EXP_BIAS = 127;
  const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
  const int32_t MINUS_32 = 0xFFFFFFE0;
  Label L_special_case, L_block1, L_exit;
  // dst = (bits & EXP_BIT_MASK) >> (SIGNIFICAND_WIDTH - 1)
  movl(rtmp, FloatConsts_EXP_BIT_MASK);
  movdl(dst, src);
  andl(dst, rtmp);
  sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
  // shift = (SIGNIFICAND_WIDTH - 2 + EXP_BIAS) - dst, kept in both rtmp and rcx
  movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
  subl(rtmp, dst);
  movl(rcx, rtmp);
  // (shift & -32) != 0, i.e. shift outside [0, 31]: use the plain conversion
  movl(dst, MINUS_32);
  testl(rtmp, dst);
  jccb(Assembler::notEqual, L_special_case);
  // dst = (bits & SIGNIF_BIT_MASK) | (SIGNIF_BIT_MASK + 1)
  movdl(dst, src);
  andl(dst, FloatConsts_SIGNIF_BIT_MASK);
  orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
  // Negate dst when the raw bits of the input are negative
  movdl(rtmp, src);
  testl(rtmp, rtmp);
  jccb(Assembler::greaterEqual, L_block1);
  negl(dst);
  bind(L_block1);
  // dst = ((dst >> shift) + 1) >> 1
  sarl(dst);
  addl(dst, 0x1);
  sarl(dst, 0x1);
  jmp(L_exit);
  bind(L_special_case);
  convert_f2i(dst, src);
  bind(L_exit);
}
9758
// Emits code that rounds the double in src to a long in dst.
//
//  - dst:  result register
//  - src:  XMM register holding the double input
//  - rtmp: scratch general purpose register
//  - rcx:  receives the computed shift amount.
//          NOTE(review): sarq(dst) below takes no explicit count, so this
//          parameter is presumably required to be the real rcx register
//          (shift by CL) — confirm against callers. The parameter name also
//          shadows the global register of the same name inside this function.
void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  // Following code is line by line assembly translation rounding algorithm.
  // Please refer to java.lang.Math.round(double) algorithm for details.
  const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
  const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
  const int64_t DoubleConsts_EXP_BIAS = 1023;
  const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
  const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
  Label L_special_case, L_block1, L_exit;
  // dst = (bits & EXP_BIT_MASK) >> (SIGNIFICAND_WIDTH - 1)
  mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
  movq(dst, src);
  andq(dst, rtmp);
  sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
  // shift = (SIGNIFICAND_WIDTH - 2 + EXP_BIAS) - dst, kept in both rtmp and rcx
  mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
  subq(rtmp, dst);
  movq(rcx, rtmp);
  // (shift & -64) != 0, i.e. shift outside [0, 63]: use the plain conversion
  mov64(dst, MINUS_64);
  testq(rtmp, dst);
  jccb(Assembler::notEqual, L_special_case);
  // dst = (bits & SIGNIF_BIT_MASK) | (SIGNIF_BIT_MASK + 1)
  movq(dst, src);
  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
  andq(dst, rtmp);
  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
  orq(dst, rtmp);
  // Negate dst when the raw bits of the input are negative
  movq(rtmp, src);
  testq(rtmp, rtmp);
  jccb(Assembler::greaterEqual, L_block1);
  negq(dst);
  bind(L_block1);
  // dst = ((dst >> shift) + 1) >> 1
  sarq(dst);
  addq(dst, 0x1);
  sarq(dst, 0x1);
  jmp(L_exit);
  bind(L_special_case);
  convert_d2l(dst, src);
  bind(L_exit);
}
9796
9797 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
9798 Label done;
9799 cvttsd2siq(dst, src);
9800 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9801 jccb(Assembler::notEqual, done);
9802 subptr(rsp, 8);
9803 movdbl(Address(rsp, 0), src);
9804 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
9805 pop(dst);
9806 bind(done);
9807 }
9808
9809 void MacroAssembler::cache_wb(Address line)
9810 {
9811 // 64 bit cpus always support clflush
9812 bool optimized = VM_Version::supports_clflushopt();
9813 bool no_evict = VM_Version::supports_clwb();
9814
9815 // prefer clwb (writeback without evict) otherwise
9816 // prefer clflushopt (potentially parallel writeback with evict)
9817 // otherwise fallback on clflush (serial writeback with evict)
9818
9819 if (optimized) {
9820 if (no_evict) {
9821 clwb(line);
9822 } else {
9823 clflushopt(line);
9824 }
9825 } else {
9826 // no need for fence when using CLFLUSH
9827 clflush(line);
9828 }
9829 }
9830
9831 void MacroAssembler::cache_wbsync(bool is_pre)
9832 {
9833 bool optimized = VM_Version::supports_clflushopt();
9834 bool no_evict = VM_Version::supports_clwb();
9835
9836 // pick the correct implementation
9837
9838 if (!is_pre && (optimized || no_evict)) {
9839 // need an sfence for post flush when using clflushopt or clwb
9840 // otherwise no no need for any synchroniaztion
9841
9842 sfence();
9843 }
9844 }
9845
9846 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9847 switch (cond) {
9848 // Note some conditions are synonyms for others
9849 case Assembler::zero: return Assembler::notZero;
9850 case Assembler::notZero: return Assembler::zero;
9851 case Assembler::less: return Assembler::greaterEqual;
9852 case Assembler::lessEqual: return Assembler::greater;
9853 case Assembler::greater: return Assembler::lessEqual;
9854 case Assembler::greaterEqual: return Assembler::less;
9855 case Assembler::below: return Assembler::aboveEqual;
9856 case Assembler::belowEqual: return Assembler::above;
9857 case Assembler::above: return Assembler::belowEqual;
9858 case Assembler::aboveEqual: return Assembler::below;
9859 case Assembler::overflow: return Assembler::noOverflow;
9860 case Assembler::noOverflow: return Assembler::overflow;
9861 case Assembler::negative: return Assembler::positive;
9862 case Assembler::positive: return Assembler::negative;
9863 case Assembler::parity: return Assembler::noParity;
9864 case Assembler::noParity: return Assembler::parity;
9865 }
9866 ShouldNotReachHere(); return Assembler::overflow;
9867 }
9868
// This is simply a call to Thread::current()
//
// Emits a leaf call to Thread::current and moves the result from rax into
// 'thread'. rdi, rsi, rdx, rcx and r8-r11 are saved on the stack around the
// call and restored afterwards. rax is saved and restored as well, unless
// 'thread' is rax itself, in which case rax simply keeps the result.
void MacroAssembler::get_thread_slow(Register thread) {
  if (thread != rax) {
    push(rax);
  }
  push(rdi);
  push(rsi);
  push(rdx);
  push(rcx);
  push(r8);
  push(r9);
  push(r10);
  push(r11);

  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);

  // Restore in reverse push order.
  pop(r11);
  pop(r10);
  pop(r9);
  pop(r8);
  pop(rcx);
  pop(rdx);
  pop(rsi);
  pop(rdi);
  if (thread != rax) {
    // Copy the result out of rax before the saved rax is restored.
    mov(thread, rax);
    pop(rax);
  }
}
9898
9899 void MacroAssembler::check_stack_alignment(Register sp, const char* msg, unsigned bias, Register tmp) {
9900 Label L_stack_ok;
9901 if (bias == 0) {
9902 testptr(sp, 2 * wordSize - 1);
9903 } else {
9904 // lea(tmp, Address(rsp, bias);
9905 mov(tmp, sp);
9906 addptr(tmp, bias);
9907 testptr(tmp, 2 * wordSize - 1);
9908 }
9909 jcc(Assembler::equal, L_stack_ok);
9910 block_comment(msg);
9911 stop(msg);
9912 bind(L_stack_ok);
9913 }
9914
// Implements fast-locking.
//
//  - basic_lock: base register of the BasicObjectLock; only used when
//                UseObjectMonitorTable is set, to clear the BasicLock's
//                object monitor cache field
//  - obj:        the object to be locked
//  - reg_rax:    must be rax (asserted); receives the mark word and is used
//                by the compare-and-exchange
//  - tmp:        a temporary register; also used as the lock-stack top offset
//  - slow:       branch target taken whenever the fast path cannot complete
//
// The locking thread is taken from r15_thread.
// On success the emitted code falls through with obj pushed on the thread's
// lock-stack.
void MacroAssembler::fast_lock(Register basic_lock, Register obj, Register reg_rax, Register tmp, Label& slow) {
  Register thread = r15_thread;

  assert(reg_rax == rax, "");
  assert_different_registers(basic_lock, obj, reg_rax, thread, tmp);

  Label push;
  // 'top' is an alias for tmp: the two uses never overlap except across the
  // CAS below, after which top is reloaded.
  const Register top = tmp;

  // Preload the markWord. It is important that this is the first
  // instruction emitted as it is part of C1's null check semantics.
  movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Locking an instance of a value-based class always goes to the slow path.
    load_klass(tmp, obj, rscratch1);
    testb(Address(tmp, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow);
  }

  // Load top.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

  // Check if the lock-stack is full.
  cmpl(top, LockStack::end_offset());
  jcc(Assembler::greaterEqual, slow);

  // Check for recursion.
  // The entry just below top is compared with obj; a match skips the CAS.
  cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
  jcc(Assembler::equal, push);

  // Check header for monitor (0b10).
  testptr(reg_rax, markWord::monitor_value);
  jcc(Assembler::notZero, slow);

  // Try to lock. Transition lock bits 0b01 => 0b00
  // Expected value (rax) has the unlocked bit set, new value (tmp) has it cleared.
  movptr(tmp, reg_rax);
  andptr(tmp, ~(int32_t)markWord::unlocked_value);
  orptr(reg_rax, markWord::unlocked_value);
  lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
  jcc(Assembler::notEqual, slow);

  // Restore top, CAS clobbers register.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

  bind(push);
  // After successful lock, push object on lock-stack.
  movptr(Address(thread, top), obj);
  incrementl(top, oopSize);
  movl(Address(thread, JavaThread::lock_stack_top_offset()), top);
}
9976
// Implements fast-unlocking.
//
//  - obj:     the object to be unlocked
//  - reg_rax: must be rax (asserted); receives the mark word and is used by
//             the compare-and-exchange
//  - tmp:     a temporary register; also used as the lock-stack top offset
//  - slow:    branch target taken whenever the fast path cannot complete
//
// The unlocking thread is taken from r15_thread.
// On success the emitted code falls through with obj popped from the thread's
// lock-stack. When the slow path is taken after the pop, the lock-stack top is
// restored first.
void MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register tmp, Label& slow) {
  Register thread = r15_thread;

  assert(reg_rax == rax, "");
  assert_different_registers(obj, reg_rax, thread, tmp);

  Label unlocked, push_and_slow;
  // 'top' is an alias for tmp.
  const Register top = tmp;

  // Check if obj is top of lock-stack.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
  cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
  jcc(Assembler::notEqual, slow);

  // Pop lock-stack.
  // Only the in-memory top is decremented; the 'top' register keeps the old
  // value. Debug builds also zero the popped slot.
  DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
  subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

  // Check if recursive.
  // Relative to the old top, -2 * oopSize is the entry below the popped one.
  cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
  jcc(Assembler::equal, unlocked);

  // Not recursive. Check header for monitor (0b10).
  movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
  testptr(reg_rax, markWord::monitor_value);
  jcc(Assembler::notZero, push_and_slow);

#ifdef ASSERT
  // Check header not unlocked (0b01).
  Label not_unlocked;
  testptr(reg_rax, markWord::unlocked_value);
  jcc(Assembler::zero, not_unlocked);
  stop("fast_unlock already unlocked");
  bind(not_unlocked);
#endif

  // Try to unlock. Transition lock bits 0b00 => 0b01
  // Expected value is the mark word in rax; new value (tmp) has the unlocked bit set.
  movptr(tmp, reg_rax);
  orptr(tmp, markWord::unlocked_value);
  lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
  jcc(Assembler::equal, unlocked);

  bind(push_and_slow);
  // Restore lock-stack and handle the unlock in runtime.
#ifdef ASSERT
  // Debug builds zeroed the popped slot above, so obj is written back.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
  movptr(Address(thread, top), obj);
#endif
  addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
  jmp(slow);

  bind(unlocked);
}
10036
// Saves legacy GPRs state on stack.
//
// Reserves 16 words below rsp and stores rax, rcx, rdx, rbx, rbp, rsi, rdi and
// r8-r15 into fixed slots (rax at the highest slot, r15 at offset 0).
// Slot 11 is reserved but never written; rsp itself is not stored.
// restore_legacy_gprs() reads the same layout back.
void MacroAssembler::save_legacy_gprs() {
  subq(rsp, 16 * wordSize);
  movq(Address(rsp, 15 * wordSize), rax);
  movq(Address(rsp, 14 * wordSize), rcx);
  movq(Address(rsp, 13 * wordSize), rdx);
  movq(Address(rsp, 12 * wordSize), rbx);
  // slot 11 intentionally skipped
  movq(Address(rsp, 10 * wordSize), rbp);
  movq(Address(rsp, 9 * wordSize), rsi);
  movq(Address(rsp, 8 * wordSize), rdi);
  movq(Address(rsp, 7 * wordSize), r8);
  movq(Address(rsp, 6 * wordSize), r9);
  movq(Address(rsp, 5 * wordSize), r10);
  movq(Address(rsp, 4 * wordSize), r11);
  movq(Address(rsp, 3 * wordSize), r12);
  movq(Address(rsp, 2 * wordSize), r13);
  movq(Address(rsp, wordSize), r14);
  movq(Address(rsp, 0), r15);
}
10056
// Restores legacy GPRs state from stack.
//
// Reloads r15-r8, rdi, rsi, rbp, rbx, rdx, rcx and rax from the slot layout
// written by save_legacy_gprs() (slot 11 is not read), then releases the
// 16-word area by adding it back to rsp.
void MacroAssembler::restore_legacy_gprs() {
  movq(r15, Address(rsp, 0));
  movq(r14, Address(rsp, wordSize));
  movq(r13, Address(rsp, 2 * wordSize));
  movq(r12, Address(rsp, 3 * wordSize));
  movq(r11, Address(rsp, 4 * wordSize));
  movq(r10, Address(rsp, 5 * wordSize));
  movq(r9, Address(rsp, 6 * wordSize));
  movq(r8, Address(rsp, 7 * wordSize));
  movq(rdi, Address(rsp, 8 * wordSize));
  movq(rsi, Address(rsp, 9 * wordSize));
  movq(rbp, Address(rsp, 10 * wordSize));
  // slot 11 intentionally skipped
  movq(rbx, Address(rsp, 12 * wordSize));
  movq(rdx, Address(rsp, 13 * wordSize));
  movq(rcx, Address(rsp, 14 * wordSize));
  movq(rax, Address(rsp, 15 * wordSize));
  addq(rsp, 16 * wordSize);
}
10076
10077 void MacroAssembler::load_aotrc_address(Register reg, address a) {
10078 #if INCLUDE_CDS
10079 assert(AOTRuntimeConstants::contains(a), "address out of range for data area");
10080 if (AOTCodeCache::is_on_for_dump()) {
10081 // all aotrc field addresses should be registered in the AOTCodeCache address table
10082 lea(reg, ExternalAddress(a));
10083 } else {
10084 mov64(reg, (uint64_t)a);
10085 }
10086 #else
10087 ShouldNotReachHere();
10088 #endif
10089 }
10090
10091 void MacroAssembler::setcc(Assembler::Condition comparison, Register dst) {
10092 if (VM_Version::supports_apx_f()) {
10093 esetzucc(comparison, dst);
10094 } else {
10095 setb(comparison, dst);
10096 movzbl(dst, dst);
10097 }
10098 }
--- EOF ---