1 /* 2 * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "jvm.h" 27 #include "asm/assembler.hpp" 28 #include "asm/assembler.inline.hpp" 29 #include "c1/c1_FrameMap.hpp" 30 #include "compiler/compiler_globals.hpp" 31 #include "compiler/disassembler.hpp" 32 #include "ci/ciInlineKlass.hpp" 33 #include "gc/shared/barrierSet.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/collectedHeap.inline.hpp" 36 #include "gc/shared/tlab_globals.hpp" 37 #include "interpreter/bytecodeHistogram.hpp" 38 #include "interpreter/interpreter.hpp" 39 #include "memory/resourceArea.hpp" 40 #include "memory/universe.hpp" 41 #include "oops/accessDecorators.hpp" 42 #include "oops/compressedOops.inline.hpp" 43 #include "oops/klass.inline.hpp" 44 #include "prims/methodHandles.hpp" 45 #include "runtime/flags/flagSetting.hpp" 46 #include "runtime/interfaceSupport.inline.hpp" 47 #include "runtime/jniHandles.hpp" 48 #include "runtime/objectMonitor.hpp" 49 #include "runtime/os.hpp" 50 #include "runtime/safepoint.hpp" 51 #include "runtime/safepointMechanism.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/signature_cc.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "runtime/thread.hpp" 56 #include "utilities/macros.hpp" 57 #include "vmreg_x86.inline.hpp" 58 #include "crc32c.h" 59 #ifdef COMPILER2 60 #include "opto/output.hpp" 61 #endif 62 63 #ifdef PRODUCT 64 #define BLOCK_COMMENT(str) /* nothing */ 65 #define STOP(error) stop(error) 66 #else 67 #define BLOCK_COMMENT(str) block_comment(str) 68 #define STOP(error) block_comment(error); stop(error) 69 #endif 70 71 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 72 73 #ifdef ASSERT 74 bool AbstractAssembler::pd_check_instruction_mark() { return true; } 75 #endif 76 77 static Assembler::Condition reverse[] = { 78 Assembler::noOverflow /* overflow = 0x0 */ , 79 Assembler::overflow /* noOverflow = 0x1 */ , 80 Assembler::aboveEqual /* carrySet = 0x2, below = 0x2 */ , 81 Assembler::below /* aboveEqual = 0x3, carryClear = 0x3 */ , 82 Assembler::notZero /* zero = 0x4, equal = 0x4 */ , 83 Assembler::zero /* notZero = 0x5, notEqual = 0x5 */ , 84 Assembler::above /* belowEqual = 0x6 */ , 85 Assembler::belowEqual /* above = 0x7 */ , 86 Assembler::positive /* negative = 0x8 */ , 87 Assembler::negative /* positive = 0x9 */ , 88 Assembler::noParity /* parity = 0xa */ , 89 Assembler::parity /* noParity = 0xb */ , 90 Assembler::greaterEqual /* less = 0xc */ , 91 
Assembler::less /* greaterEqual = 0xd */ , 92 Assembler::greater /* lessEqual = 0xe */ , 93 Assembler::lessEqual /* greater = 0xf, */ 94 95 }; 96 97 98 // Implementation of MacroAssembler 99 100 // First all the versions that have distinct versions depending on 32/64 bit 101 // Unless the difference is trivial (1 line or so). 102 103 #ifndef _LP64 104 105 // 32bit versions 106 107 Address MacroAssembler::as_Address(AddressLiteral adr) { 108 return Address(adr.target(), adr.rspec()); 109 } 110 111 Address MacroAssembler::as_Address(ArrayAddress adr) { 112 return Address::make_array(adr); 113 } 114 115 void MacroAssembler::call_VM_leaf_base(address entry_point, 116 int number_of_arguments) { 117 call(RuntimeAddress(entry_point)); 118 increment(rsp, number_of_arguments * wordSize); 119 } 120 121 void MacroAssembler::cmpklass(Address src1, Metadata* obj) { 122 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate()); 123 } 124 125 126 void MacroAssembler::cmpklass(Register src1, Metadata* obj) { 127 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate()); 128 } 129 130 void MacroAssembler::cmpoop(Address src1, jobject obj) { 131 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate()); 132 } 133 134 void MacroAssembler::cmpoop(Register src1, jobject obj) { 135 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate()); 136 } 137 138 void MacroAssembler::extend_sign(Register hi, Register lo) { 139 // According to Intel Doc. AP-526, "Integer Divide", p.18. 140 if (VM_Version::is_P6() && hi == rdx && lo == rax) { 141 cdql(); 142 } else { 143 movl(hi, lo); 144 sarl(hi, 31); 145 } 146 } 147 148 void MacroAssembler::jC2(Register tmp, Label& L) { 149 // set parity bit if FPU flag C2 is set (via rax) 150 save_rax(tmp); 151 fwait(); fnstsw_ax(); 152 sahf(); 153 restore_rax(tmp); 154 // branch 155 jcc(Assembler::parity, L); 156 } 157 158 void MacroAssembler::jnC2(Register tmp, Label& L) { 159 // set parity bit if FPU flag C2 is set (via rax) 160 save_rax(tmp); 161 fwait(); fnstsw_ax(); 162 sahf(); 163 restore_rax(tmp); 164 // branch 165 jcc(Assembler::noParity, L); 166 } 167 168 // 32bit can do a case table jump in one instruction but we no longer allow the base 169 // to be installed in the Address class 170 void MacroAssembler::jump(ArrayAddress entry) { 171 jmp(as_Address(entry)); 172 } 173 174 // Note: y_lo will be destroyed 175 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) { 176 // Long compare for Java (semantics as described in JVM spec.) 
177 Label high, low, done; 178 179 cmpl(x_hi, y_hi); 180 jcc(Assembler::less, low); 181 jcc(Assembler::greater, high); 182 // x_hi is the return register 183 xorl(x_hi, x_hi); 184 cmpl(x_lo, y_lo); 185 jcc(Assembler::below, low); 186 jcc(Assembler::equal, done); 187 188 bind(high); 189 xorl(x_hi, x_hi); 190 increment(x_hi); 191 jmp(done); 192 193 bind(low); 194 xorl(x_hi, x_hi); 195 decrementl(x_hi); 196 197 bind(done); 198 } 199 200 void MacroAssembler::lea(Register dst, AddressLiteral src) { 201 mov_literal32(dst, (int32_t)src.target(), src.rspec()); 202 } 203 204 void MacroAssembler::lea(Address dst, AddressLiteral adr) { 205 // leal(dst, as_Address(adr)); 206 // see note in movl as to why we must use a move 207 mov_literal32(dst, (int32_t) adr.target(), adr.rspec()); 208 } 209 210 void MacroAssembler::leave() { 211 mov(rsp, rbp); 212 pop(rbp); 213 } 214 215 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) { 216 // Multiplication of two Java long values stored on the stack 217 // as illustrated below. Result is in rdx:rax. 218 // 219 // rsp ---> [ ?? ] \ \ 220 // .... | y_rsp_offset | 221 // [ y_lo ] / (in bytes) | x_rsp_offset 222 // [ y_hi ] | (in bytes) 223 // .... | 224 // [ x_lo ] / 225 // [ x_hi ] 226 // .... 227 // 228 // Basic idea: lo(result) = lo(x_lo * y_lo) 229 // hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi) 230 Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset); 231 Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset); 232 Label quick; 233 // load x_hi, y_hi and check if quick 234 // multiplication is possible 235 movl(rbx, x_hi); 236 movl(rcx, y_hi); 237 movl(rax, rbx); 238 orl(rbx, rcx); // rbx, = 0 <=> x_hi = 0 and y_hi = 0 239 jcc(Assembler::zero, quick); // if rbx, = 0 do quick multiply 240 // do full multiplication 241 // 1st step 242 mull(y_lo); // x_hi * y_lo 243 movl(rbx, rax); // save lo(x_hi * y_lo) in rbx, 244 // 2nd step 245 movl(rax, x_lo); 246 mull(rcx); // x_lo * y_hi 247 addl(rbx, rax); // add lo(x_lo * y_hi) to rbx, 248 // 3rd step 249 bind(quick); // note: rbx, = 0 if quick multiply! 250 movl(rax, x_lo); 251 mull(y_lo); // x_lo * y_lo 252 addl(rdx, rbx); // correct hi(x_lo * y_lo) 253 } 254 255 void MacroAssembler::lneg(Register hi, Register lo) { 256 negl(lo); 257 adcl(hi, 0); 258 negl(hi); 259 } 260 261 void MacroAssembler::lshl(Register hi, Register lo) { 262 // Java shift left long support (semantics as described in JVM spec., p.305) 263 // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n)) 264 // shift value is in rcx ! 265 assert(hi != rcx, "must not use rcx"); 266 assert(lo != rcx, "must not use rcx"); 267 const Register s = rcx; // shift count 268 const int n = BitsPerWord; 269 Label L; 270 andl(s, 0x3f); // s := s & 0x3f (s < 0x40) 271 cmpl(s, n); // if (s < n) 272 jcc(Assembler::less, L); // else (s >= n) 273 movl(hi, lo); // x := x << n 274 xorl(lo, lo); 275 // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n! 
276 bind(L); // s (mod n) < n 277 shldl(hi, lo); // x := x << s 278 shll(lo); 279 } 280 281 282 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) { 283 // Java shift right long support (semantics as described in JVM spec., p.306 & p.310) 284 // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n)) 285 assert(hi != rcx, "must not use rcx"); 286 assert(lo != rcx, "must not use rcx"); 287 const Register s = rcx; // shift count 288 const int n = BitsPerWord; 289 Label L; 290 andl(s, 0x3f); // s := s & 0x3f (s < 0x40) 291 cmpl(s, n); // if (s < n) 292 jcc(Assembler::less, L); // else (s >= n) 293 movl(lo, hi); // x := x >> n 294 if (sign_extension) sarl(hi, 31); 295 else xorl(hi, hi); 296 // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n! 297 bind(L); // s (mod n) < n 298 shrdl(lo, hi); // x := x >> s 299 if (sign_extension) sarl(hi); 300 else shrl(hi); 301 } 302 303 void MacroAssembler::movoop(Register dst, jobject obj) { 304 mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate()); 305 } 306 307 void MacroAssembler::movoop(Address dst, jobject obj) { 308 mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate()); 309 } 310 311 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 312 mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate()); 313 } 314 315 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) { 316 mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate()); 317 } 318 319 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) { 320 // scratch register is not used, 321 // it is defined to match parameters of 64-bit version of this method. 322 if (src.is_lval()) { 323 mov_literal32(dst, (intptr_t)src.target(), src.rspec()); 324 } else { 325 movl(dst, as_Address(src)); 326 } 327 } 328 329 void MacroAssembler::movptr(ArrayAddress dst, Register src) { 330 movl(as_Address(dst), src); 331 } 332 333 void MacroAssembler::movptr(Register dst, ArrayAddress src) { 334 movl(dst, as_Address(src)); 335 } 336 337 // src should NEVER be a real pointer. 
// Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}

void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::pushklass(Metadata* obj) {
  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
  // In order to get locks to work, we need to fake an in_VM state
  JavaThread* thread = JavaThread::current();
  JavaThreadState saved_state = thread->thread_state();
  thread->set_thread_state(_thread_in_vm);
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("eip = 0x%08x", eip);
#ifndef PRODUCT
  if ((WizardMode || Verbose) && PrintMiscellaneous) {
    tty->cr();
    findpc(eip);
    tty->cr();
  }
#endif
#define PRINT_REG(rax) \
  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
  PRINT_REG(rax);
  PRINT_REG(rbx);
  PRINT_REG(rcx);
  PRINT_REG(rdx);
  PRINT_REG(rdi);
  PRINT_REG(rsi);
  PRINT_REG(rbp);
  PRINT_REG(rsp);
#undef PRINT_REG
  // Print some words near the top of the stack.
424 int* dump_sp = (int*) rsp; 425 for (int col1 = 0; col1 < 8; col1++) { 426 tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); 427 os::print_location(tty, *dump_sp++); 428 } 429 for (int row = 0; row < 16; row++) { 430 tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); 431 for (int col = 0; col < 8; col++) { 432 tty->print(" 0x%08x", *dump_sp++); 433 } 434 tty->cr(); 435 } 436 // Print some instructions around pc: 437 Disassembler::decode((address)eip-64, (address)eip); 438 tty->print_cr("--------"); 439 Disassembler::decode((address)eip, (address)eip+32); 440 } 441 442 void MacroAssembler::stop(const char* msg) { 443 ExternalAddress message((address)msg); 444 // push address of message 445 pushptr(message.addr()); 446 { Label L; call(L, relocInfo::none); bind(L); } // push eip 447 pusha(); // push registers 448 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32))); 449 hlt(); 450 } 451 452 void MacroAssembler::warn(const char* msg) { 453 push_CPU_state(); 454 455 ExternalAddress message((address) msg); 456 // push address of message 457 pushptr(message.addr()); 458 459 call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning))); 460 addl(rsp, wordSize); // discard argument 461 pop_CPU_state(); 462 } 463 464 void MacroAssembler::print_state() { 465 { Label L; call(L, relocInfo::none); bind(L); } // push eip 466 pusha(); // push registers 467 468 push_CPU_state(); 469 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32))); 470 pop_CPU_state(); 471 472 popa(); 473 addl(rsp, wordSize); 474 } 475 476 #else // _LP64 477 478 // 64 bit versions 479 480 Address MacroAssembler::as_Address(AddressLiteral adr) { 481 // amd64 always does this as a pc-rel 482 // we can be absolute or disp based on the instruction type 483 // jmp/call are displacements others are absolute 484 assert(!adr.is_lval(), "must be rval"); 485 assert(reachable(adr), "must be"); 486 return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc()); 487 488 } 489 490 Address MacroAssembler::as_Address(ArrayAddress adr) { 491 AddressLiteral base = adr.base(); 492 lea(rscratch1, base); 493 Address index = adr.index(); 494 assert(index._disp == 0, "must not have disp"); // maybe it can? 
495 Address array(rscratch1, index._index, index._scale, index._disp); 496 return array; 497 } 498 499 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) { 500 Label L, E; 501 502 #ifdef _WIN64 503 // Windows always allocates space for it's register args 504 assert(num_args <= 4, "only register arguments supported"); 505 subq(rsp, frame::arg_reg_save_area_bytes); 506 #endif 507 508 // Align stack if necessary 509 testl(rsp, 15); 510 jcc(Assembler::zero, L); 511 512 subq(rsp, 8); 513 { 514 call(RuntimeAddress(entry_point)); 515 } 516 addq(rsp, 8); 517 jmp(E); 518 519 bind(L); 520 { 521 call(RuntimeAddress(entry_point)); 522 } 523 524 bind(E); 525 526 #ifdef _WIN64 527 // restore stack pointer 528 addq(rsp, frame::arg_reg_save_area_bytes); 529 #endif 530 531 } 532 533 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) { 534 assert(!src2.is_lval(), "should use cmpptr"); 535 536 if (reachable(src2)) { 537 cmpq(src1, as_Address(src2)); 538 } else { 539 lea(rscratch1, src2); 540 Assembler::cmpq(src1, Address(rscratch1, 0)); 541 } 542 } 543 544 int MacroAssembler::corrected_idivq(Register reg) { 545 // Full implementation of Java ldiv and lrem; checks for special 546 // case as described in JVM spec., p.243 & p.271. The function 547 // returns the (pc) offset of the idivl instruction - may be needed 548 // for implicit exceptions. 549 // 550 // normal case special case 551 // 552 // input : rax: dividend min_long 553 // reg: divisor (may not be eax/edx) -1 554 // 555 // output: rax: quotient (= rax idiv reg) min_long 556 // rdx: remainder (= rax irem reg) 0 557 assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register"); 558 static const int64_t min_long = 0x8000000000000000; 559 Label normal_case, special_case; 560 561 // check for special case 562 cmp64(rax, ExternalAddress((address) &min_long)); 563 jcc(Assembler::notEqual, normal_case); 564 xorl(rdx, rdx); // prepare rdx for possible special case (where 565 // remainder = 0) 566 cmpq(reg, -1); 567 jcc(Assembler::equal, special_case); 568 569 // handle normal case 570 bind(normal_case); 571 cdqq(); 572 int idivq_offset = offset(); 573 idivq(reg); 574 575 // normal and special case exit 576 bind(special_case); 577 578 return idivq_offset; 579 } 580 581 void MacroAssembler::decrementq(Register reg, int value) { 582 if (value == min_jint) { subq(reg, value); return; } 583 if (value < 0) { incrementq(reg, -value); return; } 584 if (value == 0) { ; return; } 585 if (value == 1 && UseIncDec) { decq(reg) ; return; } 586 /* else */ { subq(reg, value) ; return; } 587 } 588 589 void MacroAssembler::decrementq(Address dst, int value) { 590 if (value == min_jint) { subq(dst, value); return; } 591 if (value < 0) { incrementq(dst, -value); return; } 592 if (value == 0) { ; return; } 593 if (value == 1 && UseIncDec) { decq(dst) ; return; } 594 /* else */ { subq(dst, value) ; return; } 595 } 596 597 void MacroAssembler::incrementq(AddressLiteral dst) { 598 if (reachable(dst)) { 599 incrementq(as_Address(dst)); 600 } else { 601 lea(rscratch1, dst); 602 incrementq(Address(rscratch1, 0)); 603 } 604 } 605 606 void MacroAssembler::incrementq(Register reg, int value) { 607 if (value == min_jint) { addq(reg, value); return; } 608 if (value < 0) { decrementq(reg, -value); return; } 609 if (value == 0) { ; return; } 610 if (value == 1 && UseIncDec) { incq(reg) ; return; } 611 /* else */ { addq(reg, value) ; return; } 612 } 613 614 void MacroAssembler::incrementq(Address dst, int value) { 615 if (value == min_jint) { 
addq(dst, value); return; } 616 if (value < 0) { decrementq(dst, -value); return; } 617 if (value == 0) { ; return; } 618 if (value == 1 && UseIncDec) { incq(dst) ; return; } 619 /* else */ { addq(dst, value) ; return; } 620 } 621 622 // 32bit can do a case table jump in one instruction but we no longer allow the base 623 // to be installed in the Address class 624 void MacroAssembler::jump(ArrayAddress entry) { 625 lea(rscratch1, entry.base()); 626 Address dispatch = entry.index(); 627 assert(dispatch._base == noreg, "must be"); 628 dispatch._base = rscratch1; 629 jmp(dispatch); 630 } 631 632 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) { 633 ShouldNotReachHere(); // 64bit doesn't use two regs 634 cmpq(x_lo, y_lo); 635 } 636 637 void MacroAssembler::lea(Register dst, AddressLiteral src) { 638 mov_literal64(dst, (intptr_t)src.target(), src.rspec()); 639 } 640 641 void MacroAssembler::lea(Address dst, AddressLiteral adr) { 642 mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec()); 643 movptr(dst, rscratch1); 644 } 645 646 void MacroAssembler::leave() { 647 // %%% is this really better? Why not on 32bit too? 648 emit_int8((unsigned char)0xC9); // LEAVE 649 } 650 651 void MacroAssembler::lneg(Register hi, Register lo) { 652 ShouldNotReachHere(); // 64bit doesn't use two regs 653 negq(lo); 654 } 655 656 void MacroAssembler::movoop(Register dst, jobject obj) { 657 mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate()); 658 } 659 660 void MacroAssembler::movoop(Address dst, jobject obj) { 661 mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate()); 662 movq(dst, rscratch1); 663 } 664 665 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 666 mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate()); 667 } 668 669 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) { 670 mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate()); 671 movq(dst, rscratch1); 672 } 673 674 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) { 675 if (src.is_lval()) { 676 mov_literal64(dst, (intptr_t)src.target(), src.rspec()); 677 } else { 678 if (reachable(src)) { 679 movq(dst, as_Address(src)); 680 } else { 681 lea(scratch, src); 682 movq(dst, Address(scratch, 0)); 683 } 684 } 685 } 686 687 void MacroAssembler::movptr(ArrayAddress dst, Register src) { 688 movq(as_Address(dst), src); 689 } 690 691 void MacroAssembler::movptr(Register dst, ArrayAddress src) { 692 movq(dst, as_Address(src)); 693 } 694 695 // src should NEVER be a real pointer. 
Use AddressLiteral for true pointers 696 void MacroAssembler::movptr(Address dst, intptr_t src) { 697 if (is_simm32(src)) { 698 movptr(dst, checked_cast<int32_t>(src)); 699 } else { 700 mov64(rscratch1, src); 701 movq(dst, rscratch1); 702 } 703 } 704 705 // These are mostly for initializing NULL 706 void MacroAssembler::movptr(Address dst, int32_t src) { 707 movslq(dst, src); 708 } 709 710 void MacroAssembler::movptr(Register dst, int32_t src) { 711 mov64(dst, (intptr_t)src); 712 } 713 714 void MacroAssembler::pushoop(jobject obj) { 715 movoop(rscratch1, obj); 716 push(rscratch1); 717 } 718 719 void MacroAssembler::pushklass(Metadata* obj) { 720 mov_metadata(rscratch1, obj); 721 push(rscratch1); 722 } 723 724 void MacroAssembler::pushptr(AddressLiteral src) { 725 lea(rscratch1, src); 726 if (src.is_lval()) { 727 push(rscratch1); 728 } else { 729 pushq(Address(rscratch1, 0)); 730 } 731 } 732 733 void MacroAssembler::reset_last_Java_frame(bool clear_fp) { 734 reset_last_Java_frame(r15_thread, clear_fp); 735 } 736 737 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 738 Register last_java_fp, 739 address last_java_pc) { 740 vzeroupper(); 741 // determine last_java_sp register 742 if (!last_java_sp->is_valid()) { 743 last_java_sp = rsp; 744 } 745 746 // last_java_fp is optional 747 if (last_java_fp->is_valid()) { 748 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), 749 last_java_fp); 750 } 751 752 // last_java_pc is optional 753 if (last_java_pc != NULL) { 754 Address java_pc(r15_thread, 755 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()); 756 lea(rscratch1, InternalAddress(last_java_pc)); 757 movptr(java_pc, rscratch1); 758 } 759 760 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp); 761 } 762 763 static void pass_arg0(MacroAssembler* masm, Register arg) { 764 if (c_rarg0 != arg ) { 765 masm->mov(c_rarg0, arg); 766 } 767 } 768 769 static void pass_arg1(MacroAssembler* masm, Register arg) { 770 if (c_rarg1 != arg ) { 771 masm->mov(c_rarg1, arg); 772 } 773 } 774 775 static void pass_arg2(MacroAssembler* masm, Register arg) { 776 if (c_rarg2 != arg ) { 777 masm->mov(c_rarg2, arg); 778 } 779 } 780 781 static void pass_arg3(MacroAssembler* masm, Register arg) { 782 if (c_rarg3 != arg ) { 783 masm->mov(c_rarg3, arg); 784 } 785 } 786 787 void MacroAssembler::stop(const char* msg) { 788 if (ShowMessageBoxOnError) { 789 address rip = pc(); 790 pusha(); // get regs on stack 791 lea(c_rarg1, InternalAddress(rip)); 792 movq(c_rarg2, rsp); // pass pointer to regs array 793 } 794 lea(c_rarg0, ExternalAddress((address) msg)); 795 andq(rsp, -16); // align stack as required by ABI 796 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64))); 797 hlt(); 798 } 799 800 void MacroAssembler::warn(const char* msg) { 801 push(rbp); 802 movq(rbp, rsp); 803 andq(rsp, -16); // align stack as required by push_CPU_state and call 804 push_CPU_state(); // keeps alignment at 16 bytes 805 lea(c_rarg0, ExternalAddress((address) msg)); 806 lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning))); 807 call(rax); 808 pop_CPU_state(); 809 mov(rsp, rbp); 810 pop(rbp); 811 } 812 813 void MacroAssembler::print_state() { 814 address rip = pc(); 815 pusha(); // get regs on stack 816 push(rbp); 817 movq(rbp, rsp); 818 andq(rsp, -16); // align stack as required by push_CPU_state and call 819 push_CPU_state(); // keeps alignment at 16 bytes 820 821 lea(c_rarg0, InternalAddress(rip)); 822 lea(c_rarg1, Address(rbp, wordSize)); // pass 
  //   pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
#ifndef PRODUCT
  tty->cr();
  findpc(pc);
  tty->cr();
#endif
#define PRINT_REG(rax, value) \
  { tty->print("%s = ", #rax); os::print_location(tty, value); }
  PRINT_REG(rax, regs[15]);
  PRINT_REG(rbx, regs[12]);
  PRINT_REG(rcx, regs[14]);
  PRINT_REG(rdx, regs[13]);
  PRINT_REG(rdi, regs[8]);
  PRINT_REG(rsi, regs[9]);
  PRINT_REG(rbp, regs[10]);
  // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
  PRINT_REG(rsp, (intptr_t)(&regs[16]));
  PRINT_REG(r8 , regs[7]);
  PRINT_REG(r9 , regs[6]);
  PRINT_REG(r10, regs[5]);
  PRINT_REG(r11, regs[4]);
  PRINT_REG(r12, regs[3]);
  PRINT_REG(r13, regs[2]);
  PRINT_REG(r14, regs[1]);
  PRINT_REG(r15, regs[0]);
#undef PRINT_REG
  // Print some words near the top of the stack.
  int64_t* rsp = &regs[16];
  int64_t* dump_sp = rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 25; row++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 4; col++) {
      tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)pc-64, (address)pc);
  tty->print_cr("--------");
  Disassembler::decode((address)pc, (address)pc+32);
}

// The java_calling_convention describes stack locations as ideal slots on
// a frame with no abi restrictions. Since we must observe abi restrictions
// (like the placement of the register window) the slots must be biased by
// the following value.
911 static int reg2offset_in(VMReg r) { 912 // Account for saved rbp and return address 913 // This should really be in_preserve_stack_slots 914 return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size; 915 } 916 917 static int reg2offset_out(VMReg r) { 918 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size; 919 } 920 921 // A long move 922 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst) { 923 924 // The calling conventions assures us that each VMregpair is either 925 // all really one physical register or adjacent stack slots. 926 927 if (src.is_single_phys_reg() ) { 928 if (dst.is_single_phys_reg()) { 929 if (dst.first() != src.first()) { 930 mov(dst.first()->as_Register(), src.first()->as_Register()); 931 } 932 } else { 933 assert(dst.is_single_reg(), "not a stack pair"); 934 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register()); 935 } 936 } else if (dst.is_single_phys_reg()) { 937 assert(src.is_single_reg(), "not a stack pair"); 938 movq(dst.first()->as_Register(), Address(rbp, reg2offset_out(src.first()))); 939 } else { 940 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs"); 941 movq(rax, Address(rbp, reg2offset_in(src.first()))); 942 movq(Address(rsp, reg2offset_out(dst.first())), rax); 943 } 944 } 945 946 // A double move 947 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst) { 948 949 // The calling conventions assures us that each VMregpair is either 950 // all really one physical register or adjacent stack slots. 951 952 if (src.is_single_phys_reg() ) { 953 if (dst.is_single_phys_reg()) { 954 // In theory these overlap but the ordering is such that this is likely a nop 955 if ( src.first() != dst.first()) { 956 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister()); 957 } 958 } else { 959 assert(dst.is_single_reg(), "not a stack pair"); 960 movdbl(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister()); 961 } 962 } else if (dst.is_single_phys_reg()) { 963 assert(src.is_single_reg(), "not a stack pair"); 964 movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_out(src.first()))); 965 } else { 966 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs"); 967 movq(rax, Address(rbp, reg2offset_in(src.first()))); 968 movq(Address(rsp, reg2offset_out(dst.first())), rax); 969 } 970 } 971 972 973 // A float arg may have to do float reg int reg conversion 974 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst) { 975 assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move"); 976 977 // The calling conventions assures us that each VMregpair is either 978 // all really one physical register or adjacent stack slots. 
979 980 if (src.first()->is_stack()) { 981 if (dst.first()->is_stack()) { 982 movl(rax, Address(rbp, reg2offset_in(src.first()))); 983 movptr(Address(rsp, reg2offset_out(dst.first())), rax); 984 } else { 985 // stack to reg 986 assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters"); 987 movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()))); 988 } 989 } else if (dst.first()->is_stack()) { 990 // reg to stack 991 assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters"); 992 movflt(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister()); 993 } else { 994 // reg to reg 995 // In theory these overlap but the ordering is such that this is likely a nop 996 if ( src.first() != dst.first()) { 997 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister()); 998 } 999 } 1000 } 1001 1002 // On 64 bit we will store integer like items to the stack as 1003 // 64 bits items (x86_32/64 abi) even though java would only store 1004 // 32bits for a parameter. On 32bit it will simply be 32 bits 1005 // So this routine will do 32->32 on 32bit and 32->64 on 64bit 1006 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst) { 1007 if (src.first()->is_stack()) { 1008 if (dst.first()->is_stack()) { 1009 // stack to stack 1010 movslq(rax, Address(rbp, reg2offset_in(src.first()))); 1011 movq(Address(rsp, reg2offset_out(dst.first())), rax); 1012 } else { 1013 // stack to reg 1014 movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()))); 1015 } 1016 } else if (dst.first()->is_stack()) { 1017 // reg to stack 1018 // Do we really have to sign extend??? 1019 // __ movslq(src.first()->as_Register(), src.first()->as_Register()); 1020 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register()); 1021 } else { 1022 // Do we really have to sign extend??? 1023 // __ movslq(dst.first()->as_Register(), src.first()->as_Register()); 1024 if (dst.first() != src.first()) { 1025 movq(dst.first()->as_Register(), src.first()->as_Register()); 1026 } 1027 } 1028 } 1029 1030 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) { 1031 if (src.first()->is_stack()) { 1032 if (dst.first()->is_stack()) { 1033 // stack to stack 1034 movq(rax, Address(rbp, reg2offset_in(src.first()))); 1035 movq(Address(rsp, reg2offset_out(dst.first())), rax); 1036 } else { 1037 // stack to reg 1038 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()))); 1039 } 1040 } else if (dst.first()->is_stack()) { 1041 // reg to stack 1042 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register()); 1043 } else { 1044 if (dst.first() != src.first()) { 1045 movq(dst.first()->as_Register(), src.first()->as_Register()); 1046 } 1047 } 1048 } 1049 1050 // An oop arg. Must pass a handle not the oop itself 1051 void MacroAssembler::object_move(OopMap* map, 1052 int oop_handle_offset, 1053 int framesize_in_slots, 1054 VMRegPair src, 1055 VMRegPair dst, 1056 bool is_receiver, 1057 int* receiver_offset) { 1058 1059 // must pass a handle. First figure out the location we use as a handle 1060 1061 Register rHandle = dst.first()->is_stack() ? 
                                      rax : dst.first()->as_Register();

  // See if oop is NULL; if it is we need no handle

  if (src.first()->is_stack()) {

    // Oop is already on the stack as an argument
    int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
    map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
    if (is_receiver) {
      *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
    }

    cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
    lea(rHandle, Address(rbp, reg2offset_in(src.first())));
    // conditionally move a NULL
    cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
  } else {

    // Oop is in a register; we must store it to the space we reserve
    // on the stack for oop_handles and pass a handle if oop is non-NULL

    const Register rOop = src.first()->as_Register();
    int oop_slot;
    if (rOop == j_rarg0)
      oop_slot = 0;
    else if (rOop == j_rarg1)
      oop_slot = 1;
    else if (rOop == j_rarg2)
      oop_slot = 2;
    else if (rOop == j_rarg3)
      oop_slot = 3;
    else if (rOop == j_rarg4)
      oop_slot = 4;
    else {
      assert(rOop == j_rarg5, "wrong register");
      oop_slot = 5;
    }

    oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
    int offset = oop_slot*VMRegImpl::stack_slot_size;

    map->set_oop(VMRegImpl::stack2reg(oop_slot));
    // Store oop in handle area, may be NULL
    movptr(Address(rsp, offset), rOop);
    if (is_receiver) {
      *receiver_offset = offset;
    }

    cmpptr(rOop, (int32_t)NULL_WORD);
    lea(rHandle, Address(rsp, offset));
    // conditionally move a NULL from the handle area where it was just stored
    cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
  }

  // If arg is on the stack then place it otherwise it is already in correct reg.
  if (dst.first()->is_stack()) {
    movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
  }
}

#endif // _LP64

// Now versions that are common to 32/64 bit

void MacroAssembler::addptr(Register dst, int32_t imm32) {
  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
}

void MacroAssembler::addptr(Register dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addptr(Address dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::addsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::addsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    addss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    addss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::addpd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::addpd(dst, Address(rscratch1, 0));
  }
}

// See 8273459. Function for ensuring 64-byte alignment, intended for stubs only.
// Stub code is generated once and never copied.
1167 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes. 1168 void MacroAssembler::align64() { 1169 align(64, (unsigned long long) pc()); 1170 } 1171 1172 void MacroAssembler::align32() { 1173 align(32, (unsigned long long) pc()); 1174 } 1175 1176 void MacroAssembler::align(int modulus) { 1177 // 8273459: Ensure alignment is possible with current segment alignment 1178 assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment"); 1179 align(modulus, offset()); 1180 } 1181 1182 void MacroAssembler::align(int modulus, int target) { 1183 if (target % modulus != 0) { 1184 nop(modulus - (target % modulus)); 1185 } 1186 } 1187 1188 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) { 1189 // Used in sign-masking with aligned address. 1190 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 1191 if (reachable(src)) { 1192 Assembler::andpd(dst, as_Address(src)); 1193 } else { 1194 lea(scratch_reg, src); 1195 Assembler::andpd(dst, Address(scratch_reg, 0)); 1196 } 1197 } 1198 1199 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) { 1200 // Used in sign-masking with aligned address. 1201 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 1202 if (reachable(src)) { 1203 Assembler::andps(dst, as_Address(src)); 1204 } else { 1205 lea(scratch_reg, src); 1206 Assembler::andps(dst, Address(scratch_reg, 0)); 1207 } 1208 } 1209 1210 void MacroAssembler::andptr(Register dst, int32_t imm32) { 1211 LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32)); 1212 } 1213 1214 void MacroAssembler::atomic_incl(Address counter_addr) { 1215 lock(); 1216 incrementl(counter_addr); 1217 } 1218 1219 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) { 1220 if (reachable(counter_addr)) { 1221 atomic_incl(as_Address(counter_addr)); 1222 } else { 1223 lea(scr, counter_addr); 1224 atomic_incl(Address(scr, 0)); 1225 } 1226 } 1227 1228 #ifdef _LP64 1229 void MacroAssembler::atomic_incq(Address counter_addr) { 1230 lock(); 1231 incrementq(counter_addr); 1232 } 1233 1234 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) { 1235 if (reachable(counter_addr)) { 1236 atomic_incq(as_Address(counter_addr)); 1237 } else { 1238 lea(scr, counter_addr); 1239 atomic_incq(Address(scr, 0)); 1240 } 1241 } 1242 #endif 1243 1244 // Writes to stack successive pages until offset reached to check for 1245 // stack overflow + shadow pages. This clobbers tmp. 1246 void MacroAssembler::bang_stack_size(Register size, Register tmp) { 1247 movptr(tmp, rsp); 1248 // Bang stack for total size given plus shadow page size. 1249 // Bang one page at a time because large size can bang beyond yellow and 1250 // red zones. 1251 Label loop; 1252 bind(loop); 1253 movl(Address(tmp, (-os::vm_page_size())), size ); 1254 subptr(tmp, os::vm_page_size()); 1255 subl(size, os::vm_page_size()); 1256 jcc(Assembler::greater, loop); 1257 1258 // Bang down shadow pages too. 1259 // At this point, (tmp-0) is the last address touched, so don't 1260 // touch it again. (It was touched as (tmp-pagesize) but then tmp 1261 // was post-decremented.) Skip this address by starting at i=1, and 1262 // touch a few more pages below. N.B. It is important to touch all 1263 // the way down including all pages in the shadow zone. 
1264 for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / os::vm_page_size()); i++) { 1265 // this could be any sized move but this is can be a debugging crumb 1266 // so the bigger the better. 1267 movptr(Address(tmp, (-i*os::vm_page_size())), size ); 1268 } 1269 } 1270 1271 void MacroAssembler::reserved_stack_check() { 1272 // testing if reserved zone needs to be enabled 1273 Label no_reserved_zone_enabling; 1274 Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread); 1275 NOT_LP64(get_thread(rsi);) 1276 1277 cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset())); 1278 jcc(Assembler::below, no_reserved_zone_enabling); 1279 1280 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread); 1281 jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); 1282 should_not_reach_here(); 1283 1284 bind(no_reserved_zone_enabling); 1285 } 1286 1287 void MacroAssembler::c2bool(Register x) { 1288 // implements x == 0 ? 0 : 1 1289 // note: must only look at least-significant byte of x 1290 // since C-style booleans are stored in one byte 1291 // only! (was bug) 1292 andl(x, 0xFF); 1293 setb(Assembler::notZero, x); 1294 } 1295 1296 // Wouldn't need if AddressLiteral version had new name 1297 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) { 1298 Assembler::call(L, rtype); 1299 } 1300 1301 void MacroAssembler::call(Register entry) { 1302 Assembler::call(entry); 1303 } 1304 1305 void MacroAssembler::call(AddressLiteral entry) { 1306 if (reachable(entry)) { 1307 Assembler::call_literal(entry.target(), entry.rspec()); 1308 } else { 1309 lea(rscratch1, entry); 1310 Assembler::call(rscratch1); 1311 } 1312 } 1313 1314 void MacroAssembler::ic_call(address entry, jint method_index) { 1315 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index); 1316 movptr(rax, (intptr_t)Universe::non_oop_word()); 1317 call(AddressLiteral(entry, rh)); 1318 } 1319 1320 // Implementation of call_VM versions 1321 1322 void MacroAssembler::call_VM(Register oop_result, 1323 address entry_point, 1324 bool check_exceptions) { 1325 Label C, E; 1326 call(C, relocInfo::none); 1327 jmp(E); 1328 1329 bind(C); 1330 call_VM_helper(oop_result, entry_point, 0, check_exceptions); 1331 ret(0); 1332 1333 bind(E); 1334 } 1335 1336 void MacroAssembler::call_VM(Register oop_result, 1337 address entry_point, 1338 Register arg_1, 1339 bool check_exceptions) { 1340 Label C, E; 1341 call(C, relocInfo::none); 1342 jmp(E); 1343 1344 bind(C); 1345 pass_arg1(this, arg_1); 1346 call_VM_helper(oop_result, entry_point, 1, check_exceptions); 1347 ret(0); 1348 1349 bind(E); 1350 } 1351 1352 void MacroAssembler::call_VM(Register oop_result, 1353 address entry_point, 1354 Register arg_1, 1355 Register arg_2, 1356 bool check_exceptions) { 1357 Label C, E; 1358 call(C, relocInfo::none); 1359 jmp(E); 1360 1361 bind(C); 1362 1363 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1364 1365 pass_arg2(this, arg_2); 1366 pass_arg1(this, arg_1); 1367 call_VM_helper(oop_result, entry_point, 2, check_exceptions); 1368 ret(0); 1369 1370 bind(E); 1371 } 1372 1373 void MacroAssembler::call_VM(Register oop_result, 1374 address entry_point, 1375 Register arg_1, 1376 Register arg_2, 1377 Register arg_3, 1378 bool check_exceptions) { 1379 Label C, E; 1380 call(C, relocInfo::none); 1381 jmp(E); 1382 1383 bind(C); 1384 1385 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg")); 1386 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg")); 1387 pass_arg3(this, arg_3); 1388 
1389 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1390 pass_arg2(this, arg_2); 1391 1392 pass_arg1(this, arg_1); 1393 call_VM_helper(oop_result, entry_point, 3, check_exceptions); 1394 ret(0); 1395 1396 bind(E); 1397 } 1398 1399 void MacroAssembler::call_VM(Register oop_result, 1400 Register last_java_sp, 1401 address entry_point, 1402 int number_of_arguments, 1403 bool check_exceptions) { 1404 Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg); 1405 call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions); 1406 } 1407 1408 void MacroAssembler::call_VM(Register oop_result, 1409 Register last_java_sp, 1410 address entry_point, 1411 Register arg_1, 1412 bool check_exceptions) { 1413 pass_arg1(this, arg_1); 1414 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); 1415 } 1416 1417 void MacroAssembler::call_VM(Register oop_result, 1418 Register last_java_sp, 1419 address entry_point, 1420 Register arg_1, 1421 Register arg_2, 1422 bool check_exceptions) { 1423 1424 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1425 pass_arg2(this, arg_2); 1426 pass_arg1(this, arg_1); 1427 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); 1428 } 1429 1430 void MacroAssembler::call_VM(Register oop_result, 1431 Register last_java_sp, 1432 address entry_point, 1433 Register arg_1, 1434 Register arg_2, 1435 Register arg_3, 1436 bool check_exceptions) { 1437 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg")); 1438 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg")); 1439 pass_arg3(this, arg_3); 1440 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1441 pass_arg2(this, arg_2); 1442 pass_arg1(this, arg_1); 1443 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); 1444 } 1445 1446 void MacroAssembler::super_call_VM(Register oop_result, 1447 Register last_java_sp, 1448 address entry_point, 1449 int number_of_arguments, 1450 bool check_exceptions) { 1451 Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg); 1452 MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions); 1453 } 1454 1455 void MacroAssembler::super_call_VM(Register oop_result, 1456 Register last_java_sp, 1457 address entry_point, 1458 Register arg_1, 1459 bool check_exceptions) { 1460 pass_arg1(this, arg_1); 1461 super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); 1462 } 1463 1464 void MacroAssembler::super_call_VM(Register oop_result, 1465 Register last_java_sp, 1466 address entry_point, 1467 Register arg_1, 1468 Register arg_2, 1469 bool check_exceptions) { 1470 1471 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1472 pass_arg2(this, arg_2); 1473 pass_arg1(this, arg_1); 1474 super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); 1475 } 1476 1477 void MacroAssembler::super_call_VM(Register oop_result, 1478 Register last_java_sp, 1479 address entry_point, 1480 Register arg_1, 1481 Register arg_2, 1482 Register arg_3, 1483 bool check_exceptions) { 1484 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg")); 1485 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg")); 1486 pass_arg3(this, arg_3); 1487 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1488 pass_arg2(this, arg_2); 1489 pass_arg1(this, arg_1); 1490 super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); 1491 } 1492 1493 void MacroAssembler::call_VM_base(Register oop_result, 1494 Register java_thread, 1495 Register last_java_sp, 1496 address entry_point, 1497 int 
number_of_arguments, 1498 bool check_exceptions) { 1499 // determine java_thread register 1500 if (!java_thread->is_valid()) { 1501 #ifdef _LP64 1502 java_thread = r15_thread; 1503 #else 1504 java_thread = rdi; 1505 get_thread(java_thread); 1506 #endif // LP64 1507 } 1508 // determine last_java_sp register 1509 if (!last_java_sp->is_valid()) { 1510 last_java_sp = rsp; 1511 } 1512 // debugging support 1513 assert(number_of_arguments >= 0 , "cannot have negative number of arguments"); 1514 LP64_ONLY(assert(java_thread == r15_thread, "unexpected register")); 1515 #ifdef ASSERT 1516 // TraceBytecodes does not use r12 but saves it over the call, so don't verify 1517 // r12 is the heapbase. 1518 LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");) 1519 #endif // ASSERT 1520 1521 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result"); 1522 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp"); 1523 1524 // push java thread (becomes first argument of C function) 1525 1526 NOT_LP64(push(java_thread); number_of_arguments++); 1527 LP64_ONLY(mov(c_rarg0, r15_thread)); 1528 1529 // set last Java frame before call 1530 assert(last_java_sp != rbp, "can't use ebp/rbp"); 1531 1532 // Only interpreter should have to set fp 1533 set_last_Java_frame(java_thread, last_java_sp, rbp, NULL); 1534 1535 // do the call, remove parameters 1536 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments); 1537 1538 // restore the thread (cannot use the pushed argument since arguments 1539 // may be overwritten by C code generated by an optimizing compiler); 1540 // however can use the register value directly if it is callee saved. 1541 if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) { 1542 // rdi & rsi (also r15) are callee saved -> nothing to do 1543 #ifdef ASSERT 1544 guarantee(java_thread != rax, "change this code"); 1545 push(rax); 1546 { Label L; 1547 get_thread(rax); 1548 cmpptr(java_thread, rax); 1549 jcc(Assembler::equal, L); 1550 STOP("MacroAssembler::call_VM_base: rdi not callee saved?"); 1551 bind(L); 1552 } 1553 pop(rax); 1554 #endif 1555 } else { 1556 get_thread(java_thread); 1557 } 1558 // reset last Java frame 1559 // Only interpreter should have to clear fp 1560 reset_last_Java_frame(java_thread, true); 1561 1562 // C++ interp handles this in the interpreter 1563 check_and_handle_popframe(java_thread); 1564 check_and_handle_earlyret(java_thread); 1565 1566 if (check_exceptions) { 1567 // check for pending exceptions (java_thread is set upon return) 1568 cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD); 1569 #ifndef _LP64 1570 jump_cc(Assembler::notEqual, 1571 RuntimeAddress(StubRoutines::forward_exception_entry())); 1572 #else 1573 // This used to conditionally jump to forward_exception however it is 1574 // possible if we relocate that the branch will not reach. 
    // So we must jump
    // around so we can always reach

    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
#endif // LP64
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {

  // Calculate the value for last_Java_sp
  // somewhat subtle. call_VM does an intermediate call
  // which places a return address on the stack just under the
  // stack pointer as the user finished with it. This allows
  // us to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64bit call_VM can only use register args
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.

#ifdef _LP64
  // We've pushed one address, correct last_Java_sp
  lea(rax, Address(rsp, wordSize));
#else
  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
#endif // LP64

  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);

}

// Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
void MacroAssembler::call_VM_leaf0(address entry_point) {
  MacroAssembler::call_VM_leaf_base(entry_point, 0);
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point) {
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg")); 1667 pass_arg1(this, arg_1); 1668 pass_arg0(this, arg_0); 1669 MacroAssembler::call_VM_leaf_base(entry_point, 3); 1670 } 1671 1672 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1673 LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg")); 1674 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg")); 1675 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg")); 1676 pass_arg3(this, arg_3); 1677 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg")); 1678 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1679 pass_arg2(this, arg_2); 1680 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg")); 1681 pass_arg1(this, arg_1); 1682 pass_arg0(this, arg_0); 1683 MacroAssembler::call_VM_leaf_base(entry_point, 4); 1684 } 1685 1686 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { 1687 movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); 1688 movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD); 1689 verify_oop_msg(oop_result, "broken oop in call_VM_base"); 1690 } 1691 1692 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { 1693 movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 1694 movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD); 1695 } 1696 1697 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { 1698 } 1699 1700 void MacroAssembler::check_and_handle_popframe(Register java_thread) { 1701 } 1702 1703 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) { 1704 if (reachable(src1)) { 1705 cmpl(as_Address(src1), imm); 1706 } else { 1707 lea(rscratch1, src1); 1708 cmpl(Address(rscratch1, 0), imm); 1709 } 1710 } 1711 1712 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) { 1713 assert(!src2.is_lval(), "use cmpptr"); 1714 if (reachable(src2)) { 1715 cmpl(src1, as_Address(src2)); 1716 } else { 1717 lea(rscratch1, src2); 1718 cmpl(src1, Address(rscratch1, 0)); 1719 } 1720 } 1721 1722 void MacroAssembler::cmp32(Register src1, int32_t imm) { 1723 Assembler::cmpl(src1, imm); 1724 } 1725 1726 void MacroAssembler::cmp32(Register src1, Address src2) { 1727 Assembler::cmpl(src1, src2); 1728 } 1729 1730 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) { 1731 ucomisd(opr1, opr2); 1732 1733 Label L; 1734 if (unordered_is_less) { 1735 movl(dst, -1); 1736 jcc(Assembler::parity, L); 1737 jcc(Assembler::below , L); 1738 movl(dst, 0); 1739 jcc(Assembler::equal , L); 1740 increment(dst); 1741 } else { // unordered is greater 1742 movl(dst, 1); 1743 jcc(Assembler::parity, L); 1744 jcc(Assembler::above , L); 1745 movl(dst, 0); 1746 jcc(Assembler::equal , L); 1747 decrementl(dst); 1748 } 1749 bind(L); 1750 } 1751 1752 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) { 1753 ucomiss(opr1, opr2); 1754 1755 Label L; 1756 if (unordered_is_less) { 1757 movl(dst, -1); 1758 jcc(Assembler::parity, L); 1759 jcc(Assembler::below , L); 1760 movl(dst, 0); 1761 jcc(Assembler::equal , L); 1762 increment(dst); 1763 } else { // unordered is greater 1764 movl(dst, 1); 1765 jcc(Assembler::parity, L); 1766 jcc(Assembler::above , L); 1767 movl(dst, 0); 1768 jcc(Assembler::equal , L); 1769 decrementl(dst); 1770 } 1771 bind(L); 1772 } 1773 1774 1775 void MacroAssembler::cmp8(AddressLiteral src1, int imm) { 1776 if (reachable(src1)) 
{ 1777 cmpb(as_Address(src1), imm); 1778 } else { 1779 lea(rscratch1, src1); 1780 cmpb(Address(rscratch1, 0), imm); 1781 } 1782 } 1783 1784 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) { 1785 #ifdef _LP64 1786 if (src2.is_lval()) { 1787 movptr(rscratch1, src2); 1788 Assembler::cmpq(src1, rscratch1); 1789 } else if (reachable(src2)) { 1790 cmpq(src1, as_Address(src2)); 1791 } else { 1792 lea(rscratch1, src2); 1793 Assembler::cmpq(src1, Address(rscratch1, 0)); 1794 } 1795 #else 1796 if (src2.is_lval()) { 1797 cmp_literal32(src1, (int32_t) src2.target(), src2.rspec()); 1798 } else { 1799 cmpl(src1, as_Address(src2)); 1800 } 1801 #endif // _LP64 1802 } 1803 1804 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) { 1805 assert(src2.is_lval(), "not a mem-mem compare"); 1806 #ifdef _LP64 1807 // moves src2's literal address 1808 movptr(rscratch1, src2); 1809 Assembler::cmpq(src1, rscratch1); 1810 #else 1811 cmp_literal32(src1, (int32_t) src2.target(), src2.rspec()); 1812 #endif // _LP64 1813 } 1814 1815 void MacroAssembler::cmpoop(Register src1, Register src2) { 1816 cmpptr(src1, src2); 1817 } 1818 1819 void MacroAssembler::cmpoop(Register src1, Address src2) { 1820 cmpptr(src1, src2); 1821 } 1822 1823 #ifdef _LP64 1824 void MacroAssembler::cmpoop(Register src1, jobject src2) { 1825 movoop(rscratch1, src2); 1826 cmpptr(src1, rscratch1); 1827 } 1828 #endif 1829 1830 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) { 1831 if (reachable(adr)) { 1832 lock(); 1833 cmpxchgptr(reg, as_Address(adr)); 1834 } else { 1835 lea(rscratch1, adr); 1836 lock(); 1837 cmpxchgptr(reg, Address(rscratch1, 0)); 1838 } 1839 } 1840 1841 void MacroAssembler::cmpxchgptr(Register reg, Address adr) { 1842 LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr)); 1843 } 1844 1845 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) { 1846 if (reachable(src)) { 1847 Assembler::comisd(dst, as_Address(src)); 1848 } else { 1849 lea(rscratch1, src); 1850 Assembler::comisd(dst, Address(rscratch1, 0)); 1851 } 1852 } 1853 1854 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) { 1855 if (reachable(src)) { 1856 Assembler::comiss(dst, as_Address(src)); 1857 } else { 1858 lea(rscratch1, src); 1859 Assembler::comiss(dst, Address(rscratch1, 0)); 1860 } 1861 } 1862 1863 1864 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) { 1865 Condition negated_cond = negate_condition(cond); 1866 Label L; 1867 jcc(negated_cond, L); 1868 pushf(); // Preserve flags 1869 atomic_incl(counter_addr); 1870 popf(); 1871 bind(L); 1872 } 1873 1874 int MacroAssembler::corrected_idivl(Register reg) { 1875 // Full implementation of Java idiv and irem; checks for 1876 // special case as described in JVM spec., p.243 & p.271. 1877 // The function returns the (pc) offset of the idivl 1878 // instruction - may be needed for implicit exceptions. 
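// The special case exists because x86 idivl raises #DE for min_int / -1
// (the quotient overflows), while the JVM spec requires a quotient of min_int
// with remainder 0, so that operand pair is routed around the idivl below.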
1879 // 1880 // normal case special case 1881 // 1882 // input : rax,: dividend min_int 1883 // reg: divisor (may not be rax,/rdx) -1 1884 // 1885 // output: rax,: quotient (= rax, idiv reg) min_int 1886 // rdx: remainder (= rax, irem reg) 0 1887 assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register"); 1888 const int min_int = 0x80000000; 1889 Label normal_case, special_case; 1890 1891 // check for special case 1892 cmpl(rax, min_int); 1893 jcc(Assembler::notEqual, normal_case); 1894 xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0) 1895 cmpl(reg, -1); 1896 jcc(Assembler::equal, special_case); 1897 1898 // handle normal case 1899 bind(normal_case); 1900 cdql(); 1901 int idivl_offset = offset(); 1902 idivl(reg); 1903 1904 // normal and special case exit 1905 bind(special_case); 1906 1907 return idivl_offset; 1908 } 1909 1910 1911 1912 void MacroAssembler::decrementl(Register reg, int value) { 1913 if (value == min_jint) {subl(reg, value) ; return; } 1914 if (value < 0) { incrementl(reg, -value); return; } 1915 if (value == 0) { ; return; } 1916 if (value == 1 && UseIncDec) { decl(reg) ; return; } 1917 /* else */ { subl(reg, value) ; return; } 1918 } 1919 1920 void MacroAssembler::decrementl(Address dst, int value) { 1921 if (value == min_jint) {subl(dst, value) ; return; } 1922 if (value < 0) { incrementl(dst, -value); return; } 1923 if (value == 0) { ; return; } 1924 if (value == 1 && UseIncDec) { decl(dst) ; return; } 1925 /* else */ { subl(dst, value) ; return; } 1926 } 1927 1928 void MacroAssembler::division_with_shift (Register reg, int shift_value) { 1929 assert (shift_value > 0, "illegal shift value"); 1930 Label _is_positive; 1931 testl (reg, reg); 1932 jcc (Assembler::positive, _is_positive); 1933 int offset = (1 << shift_value) - 1 ; 1934 1935 if (offset == 1) { 1936 incrementl(reg); 1937 } else { 1938 addl(reg, offset); 1939 } 1940 1941 bind (_is_positive); 1942 sarl(reg, shift_value); 1943 } 1944 1945 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) { 1946 if (reachable(src)) { 1947 Assembler::divsd(dst, as_Address(src)); 1948 } else { 1949 lea(rscratch1, src); 1950 Assembler::divsd(dst, Address(rscratch1, 0)); 1951 } 1952 } 1953 1954 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) { 1955 if (reachable(src)) { 1956 Assembler::divss(dst, as_Address(src)); 1957 } else { 1958 lea(rscratch1, src); 1959 Assembler::divss(dst, Address(rscratch1, 0)); 1960 } 1961 } 1962 1963 void MacroAssembler::enter() { 1964 push(rbp); 1965 mov(rbp, rsp); 1966 } 1967 1968 // A 5 byte nop that is safe for patching (see patch_verified_entry) 1969 void MacroAssembler::fat_nop() { 1970 if (UseAddressNop) { 1971 addr_nop_5(); 1972 } else { 1973 emit_int8(0x26); // es: 1974 emit_int8(0x2e); // cs: 1975 emit_int8(0x64); // fs: 1976 emit_int8(0x65); // gs: 1977 emit_int8((unsigned char)0x90); 1978 } 1979 } 1980 1981 #ifndef _LP64 1982 void MacroAssembler::fcmp(Register tmp) { 1983 fcmp(tmp, 1, true, true); 1984 } 1985 1986 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) { 1987 assert(!pop_right || pop_left, "usage error"); 1988 if (VM_Version::supports_cmov()) { 1989 assert(tmp == noreg, "unneeded temp"); 1990 if (pop_left) { 1991 fucomip(index); 1992 } else { 1993 fucomi(index); 1994 } 1995 if (pop_right) { 1996 fpop(); 1997 } 1998 } else { 1999 assert(tmp != noreg, "need temp"); 2000 if (pop_left) { 2001 if (pop_right) { 2002 fcompp(); 2003 } else { 2004 fcomp(index); 2005 } 2006 } else { 2007 
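// compare ST(0) with ST(index) without popping either operand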
fcom(index); 2008 } 2009 // convert FPU condition into eflags condition via rax, 2010 save_rax(tmp); 2011 fwait(); fnstsw_ax(); 2012 sahf(); 2013 restore_rax(tmp); 2014 } 2015 // condition codes set as follows: 2016 // 2017 // CF (corresponds to C0) if x < y 2018 // PF (corresponds to C2) if unordered 2019 // ZF (corresponds to C3) if x = y 2020 } 2021 2022 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) { 2023 fcmp2int(dst, unordered_is_less, 1, true, true); 2024 } 2025 2026 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) { 2027 fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right); 2028 Label L; 2029 if (unordered_is_less) { 2030 movl(dst, -1); 2031 jcc(Assembler::parity, L); 2032 jcc(Assembler::below , L); 2033 movl(dst, 0); 2034 jcc(Assembler::equal , L); 2035 increment(dst); 2036 } else { // unordered is greater 2037 movl(dst, 1); 2038 jcc(Assembler::parity, L); 2039 jcc(Assembler::above , L); 2040 movl(dst, 0); 2041 jcc(Assembler::equal , L); 2042 decrementl(dst); 2043 } 2044 bind(L); 2045 } 2046 2047 void MacroAssembler::fld_d(AddressLiteral src) { 2048 fld_d(as_Address(src)); 2049 } 2050 2051 void MacroAssembler::fld_s(AddressLiteral src) { 2052 fld_s(as_Address(src)); 2053 } 2054 2055 void MacroAssembler::fldcw(AddressLiteral src) { 2056 Assembler::fldcw(as_Address(src)); 2057 } 2058 2059 void MacroAssembler::fpop() { 2060 ffree(); 2061 fincstp(); 2062 } 2063 2064 void MacroAssembler::fremr(Register tmp) { 2065 save_rax(tmp); 2066 { Label L; 2067 bind(L); 2068 fprem(); 2069 fwait(); fnstsw_ax(); 2070 sahf(); 2071 jcc(Assembler::parity, L); 2072 } 2073 restore_rax(tmp); 2074 // Result is in ST0. 2075 // Note: fxch & fpop to get rid of ST1 2076 // (otherwise FPU stack could overflow eventually) 2077 fxch(1); 2078 fpop(); 2079 } 2080 2081 void MacroAssembler::empty_FPU_stack() { 2082 if (VM_Version::supports_mmx()) { 2083 emms(); 2084 } else { 2085 for (int i = 8; i-- > 0; ) ffree(i); 2086 } 2087 } 2088 #endif // !LP64 2089 2090 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) { 2091 if (reachable(src)) { 2092 Assembler::mulpd(dst, as_Address(src)); 2093 } else { 2094 lea(rscratch1, src); 2095 Assembler::mulpd(dst, Address(rscratch1, 0)); 2096 } 2097 } 2098 2099 void MacroAssembler::load_float(Address src) { 2100 #ifdef _LP64 2101 movflt(xmm0, src); 2102 #else 2103 if (UseSSE >= 1) { 2104 movflt(xmm0, src); 2105 } else { 2106 fld_s(src); 2107 } 2108 #endif // LP64 2109 } 2110 2111 void MacroAssembler::store_float(Address dst) { 2112 #ifdef _LP64 2113 movflt(dst, xmm0); 2114 #else 2115 if (UseSSE >= 1) { 2116 movflt(dst, xmm0); 2117 } else { 2118 fstp_s(dst); 2119 } 2120 #endif // LP64 2121 } 2122 2123 void MacroAssembler::load_double(Address src) { 2124 #ifdef _LP64 2125 movdbl(xmm0, src); 2126 #else 2127 if (UseSSE >= 2) { 2128 movdbl(xmm0, src); 2129 } else { 2130 fld_d(src); 2131 } 2132 #endif // LP64 2133 } 2134 2135 void MacroAssembler::store_double(Address dst) { 2136 #ifdef _LP64 2137 movdbl(dst, xmm0); 2138 #else 2139 if (UseSSE >= 2) { 2140 movdbl(dst, xmm0); 2141 } else { 2142 fstp_d(dst); 2143 } 2144 #endif // LP64 2145 } 2146 2147 // dst = c = a * b + c 2148 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) { 2149 Assembler::vfmadd231sd(c, a, b); 2150 if (dst != c) { 2151 movdbl(dst, c); 2152 } 2153 } 2154 2155 // dst = c = a * b + c 2156 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, 
XMMRegister c) { 2157 Assembler::vfmadd231ss(c, a, b); 2158 if (dst != c) { 2159 movflt(dst, c); 2160 } 2161 } 2162 2163 // dst = c = a * b + c 2164 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) { 2165 Assembler::vfmadd231pd(c, a, b, vector_len); 2166 if (dst != c) { 2167 vmovdqu(dst, c); 2168 } 2169 } 2170 2171 // dst = c = a * b + c 2172 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) { 2173 Assembler::vfmadd231ps(c, a, b, vector_len); 2174 if (dst != c) { 2175 vmovdqu(dst, c); 2176 } 2177 } 2178 2179 // dst = c = a * b + c 2180 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) { 2181 Assembler::vfmadd231pd(c, a, b, vector_len); 2182 if (dst != c) { 2183 vmovdqu(dst, c); 2184 } 2185 } 2186 2187 // dst = c = a * b + c 2188 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) { 2189 Assembler::vfmadd231ps(c, a, b, vector_len); 2190 if (dst != c) { 2191 vmovdqu(dst, c); 2192 } 2193 } 2194 2195 void MacroAssembler::incrementl(AddressLiteral dst) { 2196 if (reachable(dst)) { 2197 incrementl(as_Address(dst)); 2198 } else { 2199 lea(rscratch1, dst); 2200 incrementl(Address(rscratch1, 0)); 2201 } 2202 } 2203 2204 void MacroAssembler::incrementl(ArrayAddress dst) { 2205 incrementl(as_Address(dst)); 2206 } 2207 2208 void MacroAssembler::incrementl(Register reg, int value) { 2209 if (value == min_jint) {addl(reg, value) ; return; } 2210 if (value < 0) { decrementl(reg, -value); return; } 2211 if (value == 0) { ; return; } 2212 if (value == 1 && UseIncDec) { incl(reg) ; return; } 2213 /* else */ { addl(reg, value) ; return; } 2214 } 2215 2216 void MacroAssembler::incrementl(Address dst, int value) { 2217 if (value == min_jint) {addl(dst, value) ; return; } 2218 if (value < 0) { decrementl(dst, -value); return; } 2219 if (value == 0) { ; return; } 2220 if (value == 1 && UseIncDec) { incl(dst) ; return; } 2221 /* else */ { addl(dst, value) ; return; } 2222 } 2223 2224 void MacroAssembler::jump(AddressLiteral dst) { 2225 if (reachable(dst)) { 2226 jmp_literal(dst.target(), dst.rspec()); 2227 } else { 2228 lea(rscratch1, dst); 2229 jmp(rscratch1); 2230 } 2231 } 2232 2233 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) { 2234 if (reachable(dst)) { 2235 InstructionMark im(this); 2236 relocate(dst.reloc()); 2237 const int short_size = 2; 2238 const int long_size = 6; 2239 int offs = (intptr_t)dst.target() - ((intptr_t)pc()); 2240 if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) { 2241 // 0111 tttn #8-bit disp 2242 emit_int8(0x70 | cc); 2243 emit_int8((offs - short_size) & 0xFF); 2244 } else { 2245 // 0000 1111 1000 tttn #32-bit disp 2246 emit_int8(0x0F); 2247 emit_int8((unsigned char)(0x80 | cc)); 2248 emit_int32(offs - long_size); 2249 } 2250 } else { 2251 #ifdef ASSERT 2252 warning("reversing conditional branch"); 2253 #endif /* ASSERT */ 2254 Label skip; 2255 jccb(reverse[cc], skip); 2256 lea(rscratch1, dst); 2257 Assembler::jmp(rscratch1); 2258 bind(skip); 2259 } 2260 } 2261 2262 void MacroAssembler::fld_x(AddressLiteral src) { 2263 Assembler::fld_x(as_Address(src)); 2264 } 2265 2266 void MacroAssembler::ldmxcsr(AddressLiteral src) { 2267 if (reachable(src)) { 2268 Assembler::ldmxcsr(as_Address(src)); 2269 } else { 2270 lea(rscratch1, src); 2271 Assembler::ldmxcsr(Address(rscratch1, 0)); 2272 } 2273 } 2274 2275 int MacroAssembler::load_signed_byte(Register dst, 
Address src) { 2276 int off; 2277 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 2278 off = offset(); 2279 movsbl(dst, src); // movsxb 2280 } else { 2281 off = load_unsigned_byte(dst, src); 2282 shll(dst, 24); 2283 sarl(dst, 24); 2284 } 2285 return off; 2286 } 2287 2288 // Note: load_signed_short used to be called load_signed_word. 2289 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler 2290 // manual, which means 16 bits, that usage is found nowhere in HotSpot code. 2291 // The term "word" in HotSpot means a 32- or 64-bit machine word. 2292 int MacroAssembler::load_signed_short(Register dst, Address src) { 2293 int off; 2294 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 2295 // This is dubious to me since it seems safe to do a signed 16 => 64 bit 2296 // version but this is what 64bit has always done. This seems to imply 2297 // that users are only using 32bits worth. 2298 off = offset(); 2299 movswl(dst, src); // movsxw 2300 } else { 2301 off = load_unsigned_short(dst, src); 2302 shll(dst, 16); 2303 sarl(dst, 16); 2304 } 2305 return off; 2306 } 2307 2308 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 2309 // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, 2310 // and "3.9 Partial Register Penalties", p. 22). 2311 int off; 2312 if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) { 2313 off = offset(); 2314 movzbl(dst, src); // movzxb 2315 } else { 2316 xorl(dst, dst); 2317 off = offset(); 2318 movb(dst, src); 2319 } 2320 return off; 2321 } 2322 2323 // Note: load_unsigned_short used to be called load_unsigned_word. 2324 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 2325 // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, 2326 // and "3.9 Partial Register Penalties", p. 22). 2327 int off; 2328 if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) { 2329 off = offset(); 2330 movzwl(dst, src); // movzxw 2331 } else { 2332 xorl(dst, dst); 2333 off = offset(); 2334 movw(dst, src); 2335 } 2336 return off; 2337 } 2338 2339 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 2340 switch (size_in_bytes) { 2341 #ifndef _LP64 2342 case 8: 2343 assert(dst2 != noreg, "second dest register required"); 2344 movl(dst, src); 2345 movl(dst2, src.plus_disp(BytesPerInt)); 2346 break; 2347 #else 2348 case 8: movq(dst, src); break; 2349 #endif 2350 case 4: movl(dst, src); break; 2351 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 2352 case 1: is_signed ? 
load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 2353 default: ShouldNotReachHere(); 2354 } 2355 } 2356 2357 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 2358 switch (size_in_bytes) { 2359 #ifndef _LP64 2360 case 8: 2361 assert(src2 != noreg, "second source register required"); 2362 movl(dst, src); 2363 movl(dst.plus_disp(BytesPerInt), src2); 2364 break; 2365 #else 2366 case 8: movq(dst, src); break; 2367 #endif 2368 case 4: movl(dst, src); break; 2369 case 2: movw(dst, src); break; 2370 case 1: movb(dst, src); break; 2371 default: ShouldNotReachHere(); 2372 } 2373 } 2374 2375 void MacroAssembler::mov32(AddressLiteral dst, Register src) { 2376 if (reachable(dst)) { 2377 movl(as_Address(dst), src); 2378 } else { 2379 lea(rscratch1, dst); 2380 movl(Address(rscratch1, 0), src); 2381 } 2382 } 2383 2384 void MacroAssembler::mov32(Register dst, AddressLiteral src) { 2385 if (reachable(src)) { 2386 movl(dst, as_Address(src)); 2387 } else { 2388 lea(rscratch1, src); 2389 movl(dst, Address(rscratch1, 0)); 2390 } 2391 } 2392 2393 // C++ bool manipulation 2394 2395 void MacroAssembler::movbool(Register dst, Address src) { 2396 if(sizeof(bool) == 1) 2397 movb(dst, src); 2398 else if(sizeof(bool) == 2) 2399 movw(dst, src); 2400 else if(sizeof(bool) == 4) 2401 movl(dst, src); 2402 else 2403 // unsupported 2404 ShouldNotReachHere(); 2405 } 2406 2407 void MacroAssembler::movbool(Address dst, bool boolconst) { 2408 if(sizeof(bool) == 1) 2409 movb(dst, (int) boolconst); 2410 else if(sizeof(bool) == 2) 2411 movw(dst, (int) boolconst); 2412 else if(sizeof(bool) == 4) 2413 movl(dst, (int) boolconst); 2414 else 2415 // unsupported 2416 ShouldNotReachHere(); 2417 } 2418 2419 void MacroAssembler::movbool(Address dst, Register src) { 2420 if(sizeof(bool) == 1) 2421 movb(dst, src); 2422 else if(sizeof(bool) == 2) 2423 movw(dst, src); 2424 else if(sizeof(bool) == 4) 2425 movl(dst, src); 2426 else 2427 // unsupported 2428 ShouldNotReachHere(); 2429 } 2430 2431 void MacroAssembler::movbyte(ArrayAddress dst, int src) { 2432 movb(as_Address(dst), src); 2433 } 2434 2435 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) { 2436 if (reachable(src)) { 2437 movdl(dst, as_Address(src)); 2438 } else { 2439 lea(rscratch1, src); 2440 movdl(dst, Address(rscratch1, 0)); 2441 } 2442 } 2443 2444 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) { 2445 if (reachable(src)) { 2446 movq(dst, as_Address(src)); 2447 } else { 2448 lea(rscratch1, src); 2449 movq(dst, Address(rscratch1, 0)); 2450 } 2451 } 2452 2453 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) { 2454 if (reachable(src)) { 2455 if (UseXmmLoadAndClearUpper) { 2456 movsd (dst, as_Address(src)); 2457 } else { 2458 movlpd(dst, as_Address(src)); 2459 } 2460 } else { 2461 lea(rscratch1, src); 2462 if (UseXmmLoadAndClearUpper) { 2463 movsd (dst, Address(rscratch1, 0)); 2464 } else { 2465 movlpd(dst, Address(rscratch1, 0)); 2466 } 2467 } 2468 } 2469 2470 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) { 2471 if (reachable(src)) { 2472 movss(dst, as_Address(src)); 2473 } else { 2474 lea(rscratch1, src); 2475 movss(dst, Address(rscratch1, 0)); 2476 } 2477 } 2478 2479 void MacroAssembler::movptr(Register dst, Register src) { 2480 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2481 } 2482 2483 void MacroAssembler::movptr(Register dst, Address src) { 2484 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2485 } 2486 2487 // src should NEVER be 
a real pointer. Use AddressLiteral for true pointers 2488 void MacroAssembler::movptr(Register dst, intptr_t src) { 2489 LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src)); 2490 } 2491 2492 void MacroAssembler::movptr(Address dst, Register src) { 2493 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2494 } 2495 2496 void MacroAssembler::movdqu(Address dst, XMMRegister src) { 2497 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2498 Assembler::movdqu(dst, src); 2499 } 2500 2501 void MacroAssembler::movdqu(XMMRegister dst, Address src) { 2502 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2503 Assembler::movdqu(dst, src); 2504 } 2505 2506 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) { 2507 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2508 Assembler::movdqu(dst, src); 2509 } 2510 2511 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) { 2512 if (reachable(src)) { 2513 movdqu(dst, as_Address(src)); 2514 } else { 2515 lea(scratchReg, src); 2516 movdqu(dst, Address(scratchReg, 0)); 2517 } 2518 } 2519 2520 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) { 2521 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2522 Assembler::vmovdqu(dst, src); 2523 } 2524 2525 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) { 2526 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2527 Assembler::vmovdqu(dst, src); 2528 } 2529 2530 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) { 2531 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2532 Assembler::vmovdqu(dst, src); 2533 } 2534 2535 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) { 2536 if (reachable(src)) { 2537 vmovdqu(dst, as_Address(src)); 2538 } 2539 else { 2540 lea(scratch_reg, src); 2541 vmovdqu(dst, Address(scratch_reg, 0)); 2542 } 2543 } 2544 2545 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg, int vector_len) { 2546 assert(vector_len <= AVX_256bit, "AVX2 vector length"); 2547 if (vector_len == AVX_256bit) { 2548 vmovdqu(dst, src, scratch_reg); 2549 } else { 2550 movdqu(dst, src, scratch_reg); 2551 } 2552 } 2553 2554 void MacroAssembler::kmov(KRegister dst, Address src) { 2555 if (VM_Version::supports_avx512bw()) { 2556 kmovql(dst, src); 2557 } else { 2558 assert(VM_Version::supports_evex(), ""); 2559 kmovwl(dst, src); 2560 } 2561 } 2562 2563 void MacroAssembler::kmov(Address dst, KRegister src) { 2564 if (VM_Version::supports_avx512bw()) { 2565 kmovql(dst, src); 2566 } else { 2567 assert(VM_Version::supports_evex(), ""); 2568 kmovwl(dst, src); 2569 } 2570 } 2571 2572 void MacroAssembler::kmov(KRegister dst, KRegister src) { 2573 if (VM_Version::supports_avx512bw()) { 2574 kmovql(dst, src); 2575 } else { 2576 assert(VM_Version::supports_evex(), ""); 2577 kmovwl(dst, src); 2578 } 2579 } 2580 2581 void MacroAssembler::kmov(Register dst, KRegister src) { 2582 if (VM_Version::supports_avx512bw()) { 2583 kmovql(dst, src); 2584 } else { 2585 assert(VM_Version::supports_evex(), ""); 2586 kmovwl(dst, src); 2587 } 2588 } 2589 2590 void MacroAssembler::kmov(KRegister dst, Register src) { 2591 if (VM_Version::supports_avx512bw()) { 2592 kmovql(dst, 
src); 2593 } else { 2594 assert(VM_Version::supports_evex(), ""); 2595 kmovwl(dst, src); 2596 } 2597 } 2598 2599 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register scratch_reg) { 2600 if (reachable(src)) { 2601 kmovql(dst, as_Address(src)); 2602 } else { 2603 lea(scratch_reg, src); 2604 kmovql(dst, Address(scratch_reg, 0)); 2605 } 2606 } 2607 2608 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) { 2609 if (reachable(src)) { 2610 kmovwl(dst, as_Address(src)); 2611 } else { 2612 lea(scratch_reg, src); 2613 kmovwl(dst, Address(scratch_reg, 0)); 2614 } 2615 } 2616 2617 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, 2618 int vector_len, Register scratch_reg) { 2619 if (reachable(src)) { 2620 if (mask == k0) { 2621 Assembler::evmovdqub(dst, as_Address(src), merge, vector_len); 2622 } else { 2623 Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len); 2624 } 2625 } else { 2626 lea(scratch_reg, src); 2627 if (mask == k0) { 2628 Assembler::evmovdqub(dst, Address(scratch_reg, 0), merge, vector_len); 2629 } else { 2630 Assembler::evmovdqub(dst, mask, Address(scratch_reg, 0), merge, vector_len); 2631 } 2632 } 2633 } 2634 2635 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, 2636 int vector_len, Register scratch_reg) { 2637 if (reachable(src)) { 2638 Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len); 2639 } else { 2640 lea(scratch_reg, src); 2641 Assembler::evmovdquw(dst, mask, Address(scratch_reg, 0), merge, vector_len); 2642 } 2643 } 2644 2645 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, 2646 int vector_len, Register scratch_reg) { 2647 if (reachable(src)) { 2648 Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len); 2649 } else { 2650 lea(scratch_reg, src); 2651 Assembler::evmovdqul(dst, mask, Address(scratch_reg, 0), merge, vector_len); 2652 } 2653 } 2654 2655 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, 2656 int vector_len, Register scratch_reg) { 2657 if (reachable(src)) { 2658 Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len); 2659 } else { 2660 lea(scratch_reg, src); 2661 Assembler::evmovdquq(dst, mask, Address(scratch_reg, 0), merge, vector_len); 2662 } 2663 } 2664 2665 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2666 if (reachable(src)) { 2667 Assembler::evmovdquq(dst, as_Address(src), vector_len); 2668 } else { 2669 lea(rscratch, src); 2670 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len); 2671 } 2672 } 2673 2674 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) { 2675 if (reachable(src)) { 2676 Assembler::movdqa(dst, as_Address(src)); 2677 } else { 2678 lea(rscratch1, src); 2679 Assembler::movdqa(dst, Address(rscratch1, 0)); 2680 } 2681 } 2682 2683 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) { 2684 if (reachable(src)) { 2685 Assembler::movsd(dst, as_Address(src)); 2686 } else { 2687 lea(rscratch1, src); 2688 Assembler::movsd(dst, Address(rscratch1, 0)); 2689 } 2690 } 2691 2692 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) { 2693 if (reachable(src)) { 2694 Assembler::movss(dst, as_Address(src)); 2695 } else { 2696 lea(rscratch1, src); 2697 Assembler::movss(dst, Address(rscratch1, 0)); 2698 } 2699 } 2700 2701 void MacroAssembler::vmovddup(XMMRegister dst, 
AddressLiteral src, int vector_len, Register rscratch) { 2702 if (reachable(src)) { 2703 Assembler::vmovddup(dst, as_Address(src), vector_len); 2704 } else { 2705 lea(rscratch, src); 2706 Assembler::vmovddup(dst, Address(rscratch, 0), vector_len); 2707 } 2708 } 2709 2710 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) { 2711 if (reachable(src)) { 2712 Assembler::mulsd(dst, as_Address(src)); 2713 } else { 2714 lea(rscratch1, src); 2715 Assembler::mulsd(dst, Address(rscratch1, 0)); 2716 } 2717 } 2718 2719 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) { 2720 if (reachable(src)) { 2721 Assembler::mulss(dst, as_Address(src)); 2722 } else { 2723 lea(rscratch1, src); 2724 Assembler::mulss(dst, Address(rscratch1, 0)); 2725 } 2726 } 2727 2728 void MacroAssembler::null_check(Register reg, int offset) { 2729 if (needs_explicit_null_check(offset)) { 2730 // provoke OS NULL exception if reg = NULL by 2731 // accessing M[reg] w/o changing any (non-CC) registers 2732 // NOTE: cmpl is plenty here to provoke a segv 2733 cmpptr(rax, Address(reg, 0)); 2734 // Note: should probably use testl(rax, Address(reg, 0)); 2735 // may be shorter code (however, this version of 2736 // testl needs to be implemented first) 2737 } else { 2738 // nothing to do, (later) access of M[reg + offset] 2739 // will provoke OS NULL exception if reg = NULL 2740 } 2741 } 2742 2743 void MacroAssembler::test_markword_is_inline_type(Register markword, Label& is_inline_type) { 2744 andptr(markword, markWord::inline_type_mask_in_place); 2745 cmpptr(markword, markWord::inline_type_pattern); 2746 jcc(Assembler::equal, is_inline_type); 2747 } 2748 2749 void MacroAssembler::test_klass_is_inline_type(Register klass, Register temp_reg, Label& is_inline_type) { 2750 movl(temp_reg, Address(klass, Klass::access_flags_offset())); 2751 testl(temp_reg, JVM_ACC_VALUE); 2752 jcc(Assembler::notZero, is_inline_type); 2753 } 2754 2755 void MacroAssembler::test_oop_is_not_inline_type(Register object, Register tmp, Label& not_inline_type) { 2756 testptr(object, object); 2757 jcc(Assembler::zero, not_inline_type); 2758 const int is_inline_type_mask = markWord::inline_type_pattern; 2759 movptr(tmp, Address(object, oopDesc::mark_offset_in_bytes())); 2760 andptr(tmp, is_inline_type_mask); 2761 cmpptr(tmp, is_inline_type_mask); 2762 jcc(Assembler::notEqual, not_inline_type); 2763 } 2764 2765 void MacroAssembler::test_klass_is_empty_inline_type(Register klass, Register temp_reg, Label& is_empty_inline_type) { 2766 #ifdef ASSERT 2767 { 2768 Label done_check; 2769 test_klass_is_inline_type(klass, temp_reg, done_check); 2770 stop("test_klass_is_empty_inline_type with non inline type klass"); 2771 bind(done_check); 2772 } 2773 #endif 2774 movl(temp_reg, Address(klass, InstanceKlass::misc_flags_offset())); 2775 testl(temp_reg, InstanceKlass::misc_flag_is_empty_inline_type()); 2776 jcc(Assembler::notZero, is_empty_inline_type); 2777 } 2778 2779 void MacroAssembler::test_field_is_null_free_inline_type(Register flags, Register temp_reg, Label& is_null_free_inline_type) { 2780 movl(temp_reg, flags); 2781 shrl(temp_reg, ConstantPoolCacheEntry::is_null_free_inline_type_shift); 2782 andl(temp_reg, 0x1); 2783 testl(temp_reg, temp_reg); 2784 jcc(Assembler::notZero, is_null_free_inline_type); 2785 } 2786 2787 void MacroAssembler::test_field_is_not_null_free_inline_type(Register flags, Register temp_reg, Label& not_null_free_inline_type) { 2788 movl(temp_reg, flags); 2789 shrl(temp_reg, ConstantPoolCacheEntry::is_null_free_inline_type_shift); 
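// the null-free flag now sits in bit 0; mask it out and test it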
2790 andl(temp_reg, 0x1); 2791 testl(temp_reg, temp_reg); 2792 jcc(Assembler::zero, not_null_free_inline_type); 2793 } 2794 2795 void MacroAssembler::test_field_is_inlined(Register flags, Register temp_reg, Label& is_inlined) { 2796 movl(temp_reg, flags); 2797 shrl(temp_reg, ConstantPoolCacheEntry::is_inlined_shift); 2798 andl(temp_reg, 0x1); 2799 testl(temp_reg, temp_reg); 2800 jcc(Assembler::notZero, is_inlined); 2801 } 2802 2803 void MacroAssembler::test_oop_prototype_bit(Register oop, Register temp_reg, int32_t test_bit, bool jmp_set, Label& jmp_label) { 2804 Label test_mark_word; 2805 // load mark word 2806 movptr(temp_reg, Address(oop, oopDesc::mark_offset_in_bytes())); 2807 // check displaced 2808 testl(temp_reg, markWord::unlocked_value); 2809 jccb(Assembler::notZero, test_mark_word); 2810 // slow path use klass prototype 2811 push(rscratch1); 2812 load_prototype_header(temp_reg, oop, rscratch1); 2813 pop(rscratch1); 2814 2815 bind(test_mark_word); 2816 testl(temp_reg, test_bit); 2817 jcc((jmp_set) ? Assembler::notZero : Assembler::zero, jmp_label); 2818 } 2819 2820 void MacroAssembler::test_flattened_array_oop(Register oop, Register temp_reg, 2821 Label&is_flattened_array) { 2822 #ifdef _LP64 2823 test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, true, is_flattened_array); 2824 #else 2825 load_klass(temp_reg, oop, noreg); 2826 movl(temp_reg, Address(temp_reg, Klass::layout_helper_offset())); 2827 test_flattened_array_layout(temp_reg, is_flattened_array); 2828 #endif 2829 } 2830 2831 void MacroAssembler::test_non_flattened_array_oop(Register oop, Register temp_reg, 2832 Label&is_non_flattened_array) { 2833 #ifdef _LP64 2834 test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, false, is_non_flattened_array); 2835 #else 2836 load_klass(temp_reg, oop, noreg); 2837 movl(temp_reg, Address(temp_reg, Klass::layout_helper_offset())); 2838 test_non_flattened_array_layout(temp_reg, is_non_flattened_array); 2839 #endif 2840 } 2841 2842 void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label&is_null_free_array) { 2843 #ifdef _LP64 2844 test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, true, is_null_free_array); 2845 #else 2846 load_klass(temp_reg, oop, noreg); 2847 movl(temp_reg, Address(temp_reg, Klass::layout_helper_offset())); 2848 test_null_free_array_layout(temp_reg, is_null_free_array); 2849 #endif 2850 } 2851 2852 void MacroAssembler::test_non_null_free_array_oop(Register oop, Register temp_reg, Label&is_non_null_free_array) { 2853 #ifdef _LP64 2854 test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, false, is_non_null_free_array); 2855 #else 2856 load_klass(temp_reg, oop, noreg); 2857 movl(temp_reg, Address(temp_reg, Klass::layout_helper_offset())); 2858 test_non_null_free_array_layout(temp_reg, is_non_null_free_array); 2859 #endif 2860 } 2861 2862 void MacroAssembler::test_flattened_array_layout(Register lh, Label& is_flattened_array) { 2863 testl(lh, Klass::_lh_array_tag_flat_value_bit_inplace); 2864 jcc(Assembler::notZero, is_flattened_array); 2865 } 2866 2867 void MacroAssembler::test_non_flattened_array_layout(Register lh, Label& is_non_flattened_array) { 2868 testl(lh, Klass::_lh_array_tag_flat_value_bit_inplace); 2869 jcc(Assembler::zero, is_non_flattened_array); 2870 } 2871 2872 void MacroAssembler::test_null_free_array_layout(Register lh, Label& is_null_free_array) { 2873 testl(lh, Klass::_lh_null_free_array_bit_inplace); 2874 jcc(Assembler::notZero, 
is_null_free_array); 2875 } 2876 2877 void MacroAssembler::test_non_null_free_array_layout(Register lh, Label& is_non_null_free_array) { 2878 testl(lh, Klass::_lh_null_free_array_bit_inplace); 2879 jcc(Assembler::zero, is_non_null_free_array); 2880 } 2881 2882 2883 void MacroAssembler::os_breakpoint() { 2884 // instead of directly emitting a breakpoint, call os:breakpoint for better debugability 2885 // (e.g., MSVC can't call ps() otherwise) 2886 call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint))); 2887 } 2888 2889 void MacroAssembler::unimplemented(const char* what) { 2890 const char* buf = NULL; 2891 { 2892 ResourceMark rm; 2893 stringStream ss; 2894 ss.print("unimplemented: %s", what); 2895 buf = code_string(ss.as_string()); 2896 } 2897 stop(buf); 2898 } 2899 2900 #ifdef _LP64 2901 #define XSTATE_BV 0x200 2902 #endif 2903 2904 void MacroAssembler::pop_CPU_state() { 2905 pop_FPU_state(); 2906 pop_IU_state(); 2907 } 2908 2909 void MacroAssembler::pop_FPU_state() { 2910 #ifndef _LP64 2911 frstor(Address(rsp, 0)); 2912 #else 2913 fxrstor(Address(rsp, 0)); 2914 #endif 2915 addptr(rsp, FPUStateSizeInWords * wordSize); 2916 } 2917 2918 void MacroAssembler::pop_IU_state() { 2919 popa(); 2920 LP64_ONLY(addq(rsp, 8)); 2921 popf(); 2922 } 2923 2924 // Save Integer and Float state 2925 // Warning: Stack must be 16 byte aligned (64bit) 2926 void MacroAssembler::push_CPU_state() { 2927 push_IU_state(); 2928 push_FPU_state(); 2929 } 2930 2931 void MacroAssembler::push_FPU_state() { 2932 subptr(rsp, FPUStateSizeInWords * wordSize); 2933 #ifndef _LP64 2934 fnsave(Address(rsp, 0)); 2935 fwait(); 2936 #else 2937 fxsave(Address(rsp, 0)); 2938 #endif // LP64 2939 } 2940 2941 void MacroAssembler::push_IU_state() { 2942 // Push flags first because pusha kills them 2943 pushf(); 2944 // Make sure rsp stays 16-byte aligned 2945 LP64_ONLY(subq(rsp, 8)); 2946 pusha(); 2947 } 2948 2949 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register 2950 if (!java_thread->is_valid()) { 2951 java_thread = rdi; 2952 get_thread(java_thread); 2953 } 2954 // we must set sp to zero to clear frame 2955 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD); 2956 // must clear fp, so that compiled frames are not confused; it is 2957 // possible that we need it only for debugging 2958 if (clear_fp) { 2959 movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 2960 } 2961 // Always clear the pc because it could have been set by make_walkable() 2962 movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD); 2963 vzeroupper(); 2964 } 2965 2966 void MacroAssembler::restore_rax(Register tmp) { 2967 if (tmp == noreg) pop(rax); 2968 else if (tmp != rax) mov(rax, tmp); 2969 } 2970 2971 void MacroAssembler::round_to(Register reg, int modulus) { 2972 addptr(reg, modulus - 1); 2973 andptr(reg, -modulus); 2974 } 2975 2976 void MacroAssembler::save_rax(Register tmp) { 2977 if (tmp == noreg) push(rax); 2978 else if (tmp != rax) mov(tmp, rax); 2979 } 2980 2981 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) { 2982 if (at_return) { 2983 // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore, 2984 // we may safely use rsp instead to perform the stack watermark check. 2985 cmpptr(in_nmethod ? 
rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset())); 2986 jcc(Assembler::above, slow_path); 2987 return; 2988 } 2989 testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit()); 2990 jcc(Assembler::notZero, slow_path); // handshake bit set implies poll 2991 } 2992 2993 // Calls to C land 2994 // 2995 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded 2996 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp 2997 // has to be reset to 0. This is required to allow proper stack traversal. 2998 void MacroAssembler::set_last_Java_frame(Register java_thread, 2999 Register last_java_sp, 3000 Register last_java_fp, 3001 address last_java_pc) { 3002 vzeroupper(); 3003 // determine java_thread register 3004 if (!java_thread->is_valid()) { 3005 java_thread = rdi; 3006 get_thread(java_thread); 3007 } 3008 // determine last_java_sp register 3009 if (!last_java_sp->is_valid()) { 3010 last_java_sp = rsp; 3011 } 3012 3013 // last_java_fp is optional 3014 3015 if (last_java_fp->is_valid()) { 3016 movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp); 3017 } 3018 3019 // last_java_pc is optional 3020 3021 if (last_java_pc != NULL) { 3022 lea(Address(java_thread, 3023 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()), 3024 InternalAddress(last_java_pc)); 3025 3026 } 3027 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp); 3028 } 3029 3030 void MacroAssembler::shlptr(Register dst, int imm8) { 3031 LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8)); 3032 } 3033 3034 void MacroAssembler::shrptr(Register dst, int imm8) { 3035 LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8)); 3036 } 3037 3038 void MacroAssembler::sign_extend_byte(Register reg) { 3039 if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) { 3040 movsbl(reg, reg); // movsxb 3041 } else { 3042 shll(reg, 24); 3043 sarl(reg, 24); 3044 } 3045 } 3046 3047 void MacroAssembler::sign_extend_short(Register reg) { 3048 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 3049 movswl(reg, reg); // movsxw 3050 } else { 3051 shll(reg, 16); 3052 sarl(reg, 16); 3053 } 3054 } 3055 3056 void MacroAssembler::testl(Register dst, AddressLiteral src) { 3057 assert(reachable(src), "Address should be reachable"); 3058 testl(dst, as_Address(src)); 3059 } 3060 3061 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) { 3062 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3063 Assembler::pcmpeqb(dst, src); 3064 } 3065 3066 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) { 3067 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3068 Assembler::pcmpeqw(dst, src); 3069 } 3070 3071 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) { 3072 assert((dst->encoding() < 16),"XMM register should be 0-15"); 3073 Assembler::pcmpestri(dst, src, imm8); 3074 } 3075 3076 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) { 3077 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 3078 Assembler::pcmpestri(dst, src, imm8); 3079 } 3080 3081 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) { 3082 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3083 
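// The legacy (non-EVEX) encoding used for xmm0-15 cannot address xmm16-31;
// those registers require EVEX, which for this instruction needs AVX-512 VL+BW,
// hence the register-range check above.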
Assembler::pmovzxbw(dst, src); 3084 } 3085 3086 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) { 3087 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3088 Assembler::pmovzxbw(dst, src); 3089 } 3090 3091 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) { 3092 assert((src->encoding() < 16),"XMM register should be 0-15"); 3093 Assembler::pmovmskb(dst, src); 3094 } 3095 3096 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) { 3097 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 3098 Assembler::ptest(dst, src); 3099 } 3100 3101 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) { 3102 if (reachable(src)) { 3103 Assembler::sqrtsd(dst, as_Address(src)); 3104 } else { 3105 lea(rscratch1, src); 3106 Assembler::sqrtsd(dst, Address(rscratch1, 0)); 3107 } 3108 } 3109 3110 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) { 3111 if (reachable(src)) { 3112 Assembler::sqrtss(dst, as_Address(src)); 3113 } else { 3114 lea(rscratch1, src); 3115 Assembler::sqrtss(dst, Address(rscratch1, 0)); 3116 } 3117 } 3118 3119 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) { 3120 if (reachable(src)) { 3121 Assembler::subsd(dst, as_Address(src)); 3122 } else { 3123 lea(rscratch1, src); 3124 Assembler::subsd(dst, Address(rscratch1, 0)); 3125 } 3126 } 3127 3128 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) { 3129 if (reachable(src)) { 3130 Assembler::roundsd(dst, as_Address(src), rmode); 3131 } else { 3132 lea(scratch_reg, src); 3133 Assembler::roundsd(dst, Address(scratch_reg, 0), rmode); 3134 } 3135 } 3136 3137 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) { 3138 if (reachable(src)) { 3139 Assembler::subss(dst, as_Address(src)); 3140 } else { 3141 lea(rscratch1, src); 3142 Assembler::subss(dst, Address(rscratch1, 0)); 3143 } 3144 } 3145 3146 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) { 3147 if (reachable(src)) { 3148 Assembler::ucomisd(dst, as_Address(src)); 3149 } else { 3150 lea(rscratch1, src); 3151 Assembler::ucomisd(dst, Address(rscratch1, 0)); 3152 } 3153 } 3154 3155 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) { 3156 if (reachable(src)) { 3157 Assembler::ucomiss(dst, as_Address(src)); 3158 } else { 3159 lea(rscratch1, src); 3160 Assembler::ucomiss(dst, Address(rscratch1, 0)); 3161 } 3162 } 3163 3164 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) { 3165 // Used in sign-bit flipping with aligned address. 
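// The legacy SSE encoding of xorpd requires a 16-byte aligned memory operand;
// the VEX (AVX) encoding does not, so the assert below accepts either AVX mode
// or a properly aligned literal.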
3166 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 3167 if (reachable(src)) { 3168 Assembler::xorpd(dst, as_Address(src)); 3169 } else { 3170 lea(scratch_reg, src); 3171 Assembler::xorpd(dst, Address(scratch_reg, 0)); 3172 } 3173 } 3174 3175 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) { 3176 if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) { 3177 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit); 3178 } 3179 else { 3180 Assembler::xorpd(dst, src); 3181 } 3182 } 3183 3184 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) { 3185 if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) { 3186 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit); 3187 } else { 3188 Assembler::xorps(dst, src); 3189 } 3190 } 3191 3192 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) { 3193 // Used in sign-bit flipping with aligned address. 3194 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 3195 if (reachable(src)) { 3196 Assembler::xorps(dst, as_Address(src)); 3197 } else { 3198 lea(scratch_reg, src); 3199 Assembler::xorps(dst, Address(scratch_reg, 0)); 3200 } 3201 } 3202 3203 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) { 3204 // Used in sign-bit flipping with aligned address. 3205 bool aligned_adr = (((intptr_t)src.target() & 15) == 0); 3206 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes"); 3207 if (reachable(src)) { 3208 Assembler::pshufb(dst, as_Address(src)); 3209 } else { 3210 lea(rscratch1, src); 3211 Assembler::pshufb(dst, Address(rscratch1, 0)); 3212 } 3213 } 3214 3215 // AVX 3-operands instructions 3216 3217 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3218 if (reachable(src)) { 3219 vaddsd(dst, nds, as_Address(src)); 3220 } else { 3221 lea(rscratch1, src); 3222 vaddsd(dst, nds, Address(rscratch1, 0)); 3223 } 3224 } 3225 3226 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3227 if (reachable(src)) { 3228 vaddss(dst, nds, as_Address(src)); 3229 } else { 3230 lea(rscratch1, src); 3231 vaddss(dst, nds, Address(rscratch1, 0)); 3232 } 3233 } 3234 3235 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3236 assert(UseAVX > 0, "requires some form of AVX"); 3237 if (reachable(src)) { 3238 Assembler::vpaddb(dst, nds, as_Address(src), vector_len); 3239 } else { 3240 lea(rscratch, src); 3241 Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len); 3242 } 3243 } 3244 3245 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3246 assert(UseAVX > 0, "requires some form of AVX"); 3247 if (reachable(src)) { 3248 Assembler::vpaddd(dst, nds, as_Address(src), vector_len); 3249 } else { 3250 lea(rscratch, src); 3251 Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len); 3252 } 3253 } 3254 3255 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) { 3256 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3257 vandps(dst, nds, negate_field, vector_len); 3258 } 3259 3260 void MacroAssembler::vabssd(XMMRegister dst, 
XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) { 3261 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3262 vandpd(dst, nds, negate_field, vector_len); 3263 } 3264 3265 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3266 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3267 Assembler::vpaddb(dst, nds, src, vector_len); 3268 } 3269 3270 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3271 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3272 Assembler::vpaddb(dst, nds, src, vector_len); 3273 } 3274 3275 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3276 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3277 Assembler::vpaddw(dst, nds, src, vector_len); 3278 } 3279 3280 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3281 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3282 Assembler::vpaddw(dst, nds, src, vector_len); 3283 } 3284 3285 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { 3286 if (reachable(src)) { 3287 Assembler::vpand(dst, nds, as_Address(src), vector_len); 3288 } else { 3289 lea(scratch_reg, src); 3290 Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len); 3291 } 3292 } 3293 3294 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) { 3295 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3296 Assembler::vpbroadcastw(dst, src, vector_len); 3297 } 3298 3299 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3300 if (reachable(src)) { 3301 Assembler::vbroadcastsd(dst, as_Address(src), vector_len); 3302 } else { 3303 lea(rscratch, src); 3304 Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len); 3305 } 3306 } 3307 3308 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3309 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3310 Assembler::vpcmpeqb(dst, nds, src, vector_len); 3311 } 3312 3313 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3314 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3315 Assembler::vpcmpeqw(dst, nds, src, vector_len); 3316 } 3317 3318 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, 3319 AddressLiteral src, int vector_len, Register scratch_reg) { 3320 if (reachable(src)) { 3321 Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len); 3322 } else { 3323 lea(scratch_reg, src); 3324 Assembler::evpcmpeqd(kdst, mask, nds, Address(scratch_reg, 0), vector_len); 3325 } 3326 } 3327 3328 void 
MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3329 int comparison, bool is_signed, int vector_len, Register scratch_reg) { 3330 if (reachable(src)) { 3331 Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3332 } else { 3333 lea(scratch_reg, src); 3334 Assembler::evpcmpd(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len); 3335 } 3336 } 3337 3338 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3339 int comparison, bool is_signed, int vector_len, Register scratch_reg) { 3340 if (reachable(src)) { 3341 Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3342 } else { 3343 lea(scratch_reg, src); 3344 Assembler::evpcmpq(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len); 3345 } 3346 } 3347 3348 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3349 int comparison, bool is_signed, int vector_len, Register scratch_reg) { 3350 if (reachable(src)) { 3351 Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3352 } else { 3353 lea(scratch_reg, src); 3354 Assembler::evpcmpb(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len); 3355 } 3356 } 3357 3358 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3359 int comparison, bool is_signed, int vector_len, Register scratch_reg) { 3360 if (reachable(src)) { 3361 Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3362 } else { 3363 lea(scratch_reg, src); 3364 Assembler::evpcmpw(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len); 3365 } 3366 } 3367 3368 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) { 3369 if (width == Assembler::Q) { 3370 Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len); 3371 } else { 3372 Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len); 3373 } 3374 } 3375 3376 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) { 3377 int eq_cond_enc = 0x29; 3378 int gt_cond_enc = 0x37; 3379 if (width != Assembler::Q) { 3380 eq_cond_enc = 0x74 + width; 3381 gt_cond_enc = 0x64 + width; 3382 } 3383 switch (cond) { 3384 case eq: 3385 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len); 3386 break; 3387 case neq: 3388 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len); 3389 vallones(xtmp, vector_len); 3390 vpxor(dst, xtmp, dst, vector_len); 3391 break; 3392 case le: 3393 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len); 3394 vallones(xtmp, vector_len); 3395 vpxor(dst, xtmp, dst, vector_len); 3396 break; 3397 case nlt: 3398 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len); 3399 vallones(xtmp, vector_len); 3400 vpxor(dst, xtmp, dst, vector_len); 3401 break; 3402 case lt: 3403 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len); 3404 break; 3405 case nle: 3406 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len); 3407 break; 3408 default: 3409 assert(false, "Should not reach here"); 3410 } 3411 } 3412 3413 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) { 3414 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3415 
Assembler::vpmovzxbw(dst, src, vector_len); 3416 } 3417 3418 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) { 3419 assert((src->encoding() < 16),"XMM register should be 0-15"); 3420 Assembler::vpmovmskb(dst, src, vector_len); 3421 } 3422 3423 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3424 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3425 Assembler::vpmullw(dst, nds, src, vector_len); 3426 } 3427 3428 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3429 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3430 Assembler::vpmullw(dst, nds, src, vector_len); 3431 } 3432 3433 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { 3434 assert((UseAVX > 0), "AVX support is needed"); 3435 if (reachable(src)) { 3436 Assembler::vpmulld(dst, nds, as_Address(src), vector_len); 3437 } else { 3438 lea(scratch_reg, src); 3439 Assembler::vpmulld(dst, nds, Address(scratch_reg, 0), vector_len); 3440 } 3441 } 3442 3443 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3444 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3445 Assembler::vpsubb(dst, nds, src, vector_len); 3446 } 3447 3448 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3449 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3450 Assembler::vpsubb(dst, nds, src, vector_len); 3451 } 3452 3453 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3454 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3455 Assembler::vpsubw(dst, nds, src, vector_len); 3456 } 3457 3458 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3459 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3460 Assembler::vpsubw(dst, nds, src, vector_len); 3461 } 3462 3463 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3464 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3465 Assembler::vpsraw(dst, nds, shift, vector_len); 3466 } 3467 3468 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3469 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3470 Assembler::vpsraw(dst, nds, shift, vector_len); 3471 } 3472 3473 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3474 assert(UseAVX > 2,""); 3475 if (!VM_Version::supports_avx512vl() && vector_len < 2) { 3476 vector_len = 2; 3477 } 3478 Assembler::evpsraq(dst, nds, shift, vector_len); 3479 } 3480 3481 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3482 assert(UseAVX > 2,""); 3483 if 
(!VM_Version::supports_avx512vl() && vector_len < 2) { 3484 vector_len = 2; 3485 } 3486 Assembler::evpsraq(dst, nds, shift, vector_len); 3487 } 3488 3489 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3490 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3491 Assembler::vpsrlw(dst, nds, shift, vector_len); 3492 } 3493 3494 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3495 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3496 Assembler::vpsrlw(dst, nds, shift, vector_len); 3497 } 3498 3499 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3500 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3501 Assembler::vpsllw(dst, nds, shift, vector_len); 3502 } 3503 3504 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3505 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3506 Assembler::vpsllw(dst, nds, shift, vector_len); 3507 } 3508 3509 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) { 3510 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 3511 Assembler::vptest(dst, src); 3512 } 3513 3514 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) { 3515 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3516 Assembler::punpcklbw(dst, src); 3517 } 3518 3519 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) { 3520 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 3521 Assembler::pshufd(dst, src, mode); 3522 } 3523 3524 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) { 3525 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3526 Assembler::pshuflw(dst, src, mode); 3527 } 3528 3529 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { 3530 if (reachable(src)) { 3531 vandpd(dst, nds, as_Address(src), vector_len); 3532 } else { 3533 lea(scratch_reg, src); 3534 vandpd(dst, nds, Address(scratch_reg, 0), vector_len); 3535 } 3536 } 3537 3538 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { 3539 if (reachable(src)) { 3540 vandps(dst, nds, as_Address(src), vector_len); 3541 } else { 3542 lea(scratch_reg, src); 3543 vandps(dst, nds, Address(scratch_reg, 0), vector_len); 3544 } 3545 } 3546 3547 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, 3548 bool merge, int vector_len, Register scratch_reg) { 3549 if (reachable(src)) { 3550 Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len); 3551 } else { 3552 lea(scratch_reg, src); 3553 Assembler::evpord(dst, mask, nds, Address(scratch_reg, 0), merge, vector_len); 3554 } 3555 } 3556 3557 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3558 if (reachable(src)) { 3559 vdivsd(dst, nds, as_Address(src)); 3560 
} else { 3561 lea(rscratch1, src); 3562 vdivsd(dst, nds, Address(rscratch1, 0)); 3563 } 3564 } 3565 3566 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3567 if (reachable(src)) { 3568 vdivss(dst, nds, as_Address(src)); 3569 } else { 3570 lea(rscratch1, src); 3571 vdivss(dst, nds, Address(rscratch1, 0)); 3572 } 3573 } 3574 3575 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3576 if (reachable(src)) { 3577 vmulsd(dst, nds, as_Address(src)); 3578 } else { 3579 lea(rscratch1, src); 3580 vmulsd(dst, nds, Address(rscratch1, 0)); 3581 } 3582 } 3583 3584 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3585 if (reachable(src)) { 3586 vmulss(dst, nds, as_Address(src)); 3587 } else { 3588 lea(rscratch1, src); 3589 vmulss(dst, nds, Address(rscratch1, 0)); 3590 } 3591 } 3592 3593 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3594 if (reachable(src)) { 3595 vsubsd(dst, nds, as_Address(src)); 3596 } else { 3597 lea(rscratch1, src); 3598 vsubsd(dst, nds, Address(rscratch1, 0)); 3599 } 3600 } 3601 3602 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3603 if (reachable(src)) { 3604 vsubss(dst, nds, as_Address(src)); 3605 } else { 3606 lea(rscratch1, src); 3607 vsubss(dst, nds, Address(rscratch1, 0)); 3608 } 3609 } 3610 3611 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3612 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3613 vxorps(dst, nds, src, Assembler::AVX_128bit); 3614 } 3615 3616 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3617 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3618 vxorpd(dst, nds, src, Assembler::AVX_128bit); 3619 } 3620 3621 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { 3622 if (reachable(src)) { 3623 vxorpd(dst, nds, as_Address(src), vector_len); 3624 } else { 3625 lea(scratch_reg, src); 3626 vxorpd(dst, nds, Address(scratch_reg, 0), vector_len); 3627 } 3628 } 3629 3630 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { 3631 if (reachable(src)) { 3632 vxorps(dst, nds, as_Address(src), vector_len); 3633 } else { 3634 lea(scratch_reg, src); 3635 vxorps(dst, nds, Address(scratch_reg, 0), vector_len); 3636 } 3637 } 3638 3639 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { 3640 if (UseAVX > 1 || (vector_len < 1)) { 3641 if (reachable(src)) { 3642 Assembler::vpxor(dst, nds, as_Address(src), vector_len); 3643 } else { 3644 lea(scratch_reg, src); 3645 Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len); 3646 } 3647 } 3648 else { 3649 MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg); 3650 } 3651 } 3652 3653 void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { 3654 if (reachable(src)) { 3655 Assembler::vpermd(dst, nds, as_Address(src), vector_len); 3656 } else { 3657 lea(scratch_reg, src); 3658 Assembler::vpermd(dst, nds, Address(scratch_reg, 0), vector_len); 3659 } 3660 } 3661 3662 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) { 3663 const int32_t 
inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask); 3664 STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code 3665 // The inverted mask is sign-extended 3666 andptr(possibly_jweak, inverted_jweak_mask); 3667 } 3668 3669 void MacroAssembler::resolve_jobject(Register value, 3670 Register thread, 3671 Register tmp) { 3672 assert_different_registers(value, thread, tmp); 3673 Label done, not_weak; 3674 testptr(value, value); 3675 jcc(Assembler::zero, done); // Use NULL as-is. 3676 testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag. 3677 jcc(Assembler::zero, not_weak); 3678 // Resolve jweak. 3679 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, 3680 value, Address(value, -JNIHandles::weak_tag_value), tmp, thread); 3681 verify_oop(value); 3682 jmp(done); 3683 bind(not_weak); 3684 // Resolve (untagged) jobject. 3685 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 3686 verify_oop(value); 3687 bind(done); 3688 } 3689 3690 void MacroAssembler::subptr(Register dst, int32_t imm32) { 3691 LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32)); 3692 } 3693 3694 // Force generation of a 4 byte immediate value even if it fits into 8bit 3695 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) { 3696 LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32)); 3697 } 3698 3699 void MacroAssembler::subptr(Register dst, Register src) { 3700 LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); 3701 } 3702 3703 // C++ bool manipulation 3704 void MacroAssembler::testbool(Register dst) { 3705 if(sizeof(bool) == 1) 3706 testb(dst, 0xff); 3707 else if(sizeof(bool) == 2) { 3708 // testw implementation needed for two byte bools 3709 ShouldNotReachHere(); 3710 } else if(sizeof(bool) == 4) 3711 testl(dst, dst); 3712 else 3713 // unsupported 3714 ShouldNotReachHere(); 3715 } 3716 3717 void MacroAssembler::testptr(Register dst, Register src) { 3718 LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src)); 3719 } 3720 3721 // Object / value buffer allocation... 3722 // 3723 // Kills klass and rsi on LP64 3724 void MacroAssembler::allocate_instance(Register klass, Register new_obj, 3725 Register t1, Register t2, 3726 bool clear_fields, Label& alloc_failed) 3727 { 3728 Label done, initialize_header, initialize_object, slow_case, slow_case_no_pop; 3729 Register layout_size = t1; 3730 assert(new_obj == rax, "needs to be rax, according to barrier asm eden_allocate"); 3731 assert_different_registers(klass, new_obj, t1, t2); 3732 3733 // get instance_size in InstanceKlass (scaled to a count of bytes) 3734 movl(layout_size, Address(klass, Klass::layout_helper_offset())); 3735 // test to see if it has a finalizer or is malformed in some way 3736 testl(layout_size, Klass::_lh_instance_slow_path_bit); 3737 jcc(Assembler::notZero, slow_case_no_pop); 3738 3739 // Allocate the instance: 3740 // If TLAB is enabled: 3741 // Try to allocate in the TLAB. 3742 // If fails, go to the slow path. 3743 // Else If inline contiguous allocations are enabled: 3744 // Try to allocate in eden. 3745 // If fails due to heap end, go to slow path. 3746 // 3747 // If TLAB is enabled OR inline contiguous is enabled: 3748 // Initialize the allocation. 3749 // Exit. 3750 // 3751 // Go to slow path. 
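  // A rough C-like sketch of the decision made by the code below (illustration
  // only; try_tlab/try_eden stand in for the tlab_allocate/eden_allocate helpers
  // used further down, and the slow path is reached via slow_case/alloc_failed):
  //
  //   obj = UseTLAB ? try_tlab(size)                        // to slow_case on TLAB overflow
  //                 : (allow_shared_alloc ? try_eden(size)  // to slow_case on heap end
  //                                       : <slow path>);
  //   if (clear_fields && !ZeroTLAB) zero_fields(obj + sizeof(oopDesc), size - sizeof(oopDesc));
  //   install_header(obj, klass);                           // mark word, klass (and klass gap)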
3752 const bool allow_shared_alloc = 3753 Universe::heap()->supports_inline_contig_alloc(); 3754 3755 push(klass); 3756 const Register thread = LP64_ONLY(r15_thread) NOT_LP64(klass); 3757 #ifndef _LP64 3758 if (UseTLAB || allow_shared_alloc) { 3759 get_thread(thread); 3760 } 3761 #endif // _LP64 3762 3763 if (UseTLAB) { 3764 tlab_allocate(thread, new_obj, layout_size, 0, klass, t2, slow_case); 3765 if (ZeroTLAB || (!clear_fields)) { 3766 // the fields have been already cleared 3767 jmp(initialize_header); 3768 } else { 3769 // initialize both the header and fields 3770 jmp(initialize_object); 3771 } 3772 } else { 3773 // Allocation in the shared Eden, if allowed. 3774 // 3775 eden_allocate(thread, new_obj, layout_size, 0, t2, slow_case); 3776 } 3777 3778 // If UseTLAB or allow_shared_alloc are true, the object is created above and 3779 // there is an initialize need. Otherwise, skip and go to the slow path. 3780 if (UseTLAB || allow_shared_alloc) { 3781 if (clear_fields) { 3782 // The object is initialized before the header. If the object size is 3783 // zero, go directly to the header initialization. 3784 bind(initialize_object); 3785 decrement(layout_size, sizeof(oopDesc)); 3786 jcc(Assembler::zero, initialize_header); 3787 3788 // Initialize topmost object field, divide size by 8, check if odd and 3789 // test if zero. 3790 Register zero = klass; 3791 xorl(zero, zero); // use zero reg to clear memory (shorter code) 3792 shrl(layout_size, LogBytesPerLong); // divide by 2*oopSize and set carry flag if odd 3793 3794 #ifdef ASSERT 3795 // make sure instance_size was multiple of 8 3796 Label L; 3797 // Ignore partial flag stall after shrl() since it is debug VM 3798 jcc(Assembler::carryClear, L); 3799 stop("object size is not multiple of 2 - adjust this code"); 3800 bind(L); 3801 // must be > 0, no extra check needed here 3802 #endif 3803 3804 // initialize remaining object fields: instance_size was a multiple of 8 3805 { 3806 Label loop; 3807 bind(loop); 3808 movptr(Address(new_obj, layout_size, Address::times_8, sizeof(oopDesc) - 1*oopSize), zero); 3809 NOT_LP64(movptr(Address(new_obj, layout_size, Address::times_8, sizeof(oopDesc) - 2*oopSize), zero)); 3810 decrement(layout_size); 3811 jcc(Assembler::notZero, loop); 3812 } 3813 } // clear_fields 3814 3815 // initialize object header only. 3816 bind(initialize_header); 3817 pop(klass); 3818 Register mark_word = t2; 3819 movptr(mark_word, Address(klass, Klass::prototype_header_offset())); 3820 movptr(Address(new_obj, oopDesc::mark_offset_in_bytes ()), mark_word); 3821 #ifdef _LP64 3822 xorl(rsi, rsi); // use zero reg to clear memory (shorter code) 3823 store_klass_gap(new_obj, rsi); // zero klass gap for compressed oops 3824 #endif 3825 movptr(t2, klass); // preserve klass 3826 Register tmp_store_klass = LP64_ONLY(rscratch1) NOT_LP64(noreg); 3827 store_klass(new_obj, t2, tmp_store_klass); // src klass reg is potentially compressed 3828 3829 jmp(done); 3830 } 3831 3832 bind(slow_case); 3833 pop(klass); 3834 bind(slow_case_no_pop); 3835 jmp(alloc_failed); 3836 3837 bind(done); 3838 } 3839 3840 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 
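// Conceptually, the BarrierSetAssembler it delegates to emits a bump-pointer
// allocation from the thread's TLAB (sketch only, not the literal generated code):
//   obj = tlab.top(); new_top = obj + size;
//   if (new_top > tlab.end()) goto slow_case; else tlab.set_top(new_top);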
3841 void MacroAssembler::tlab_allocate(Register thread, Register obj, 3842 Register var_size_in_bytes, 3843 int con_size_in_bytes, 3844 Register t1, 3845 Register t2, 3846 Label& slow_case) { 3847 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3848 bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); 3849 } 3850 3851 RegSet MacroAssembler::call_clobbered_gp_registers() { 3852 RegSet regs; 3853 #ifdef _LP64 3854 regs += RegSet::of(rax, rcx, rdx); 3855 #ifndef WINDOWS 3856 regs += RegSet::of(rsi, rdi); 3857 #endif 3858 regs += RegSet::range(r8, r11); 3859 #else 3860 regs += RegSet::of(rax, rcx, rdx); 3861 #endif 3862 return regs; 3863 } 3864 3865 XMMRegSet MacroAssembler::call_clobbered_xmm_registers() { 3866 #if defined(WINDOWS) && defined(_LP64) 3867 XMMRegSet result = XMMRegSet::range(xmm0, xmm5); 3868 if (FrameMap::get_num_caller_save_xmms() > 16) { 3869 result += XMMRegSet::range(xmm16, as_XMMRegister(FrameMap::get_num_caller_save_xmms() - 1)); 3870 } 3871 return result; 3872 #else 3873 return XMMRegSet::range(xmm0, as_XMMRegister(FrameMap::get_num_caller_save_xmms() - 1)); 3874 #endif 3875 } 3876 3877 static int FPUSaveAreaSize = align_up(108, StackAlignmentInBytes); // 108 bytes needed for FPU state by fsave/frstor 3878 3879 #ifndef _LP64 3880 static bool use_x87_registers() { return UseSSE < 2; } 3881 #endif 3882 static bool use_xmm_registers() { return UseSSE >= 1; } 3883 3884 // C1 only ever uses the first double/float of the XMM register. 3885 static int xmm_save_size() { return UseSSE >= 2 ? sizeof(double) : sizeof(float); } 3886 3887 static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) { 3888 if (UseSSE == 1) { 3889 masm->movflt(Address(rsp, offset), reg); 3890 } else { 3891 masm->movdbl(Address(rsp, offset), reg); 3892 } 3893 } 3894 3895 static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) { 3896 if (UseSSE == 1) { 3897 masm->movflt(reg, Address(rsp, offset)); 3898 } else { 3899 masm->movdbl(reg, Address(rsp, offset)); 3900 } 3901 } 3902 3903 int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers, bool save_fpu, 3904 int& gp_area_size, int& fp_area_size, int& xmm_area_size) { 3905 3906 gp_area_size = align_up(gp_registers.size() * RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size, 3907 StackAlignmentInBytes); 3908 #ifdef _LP64 3909 fp_area_size = 0; 3910 #else 3911 fp_area_size = (save_fpu && use_x87_registers()) ? FPUSaveAreaSize : 0; 3912 #endif 3913 xmm_area_size = (save_fpu && use_xmm_registers()) ? 
xmm_registers.size() * xmm_save_size() : 0; 3914 3915 return gp_area_size + fp_area_size + xmm_area_size; 3916 } 3917 3918 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) { 3919 block_comment("push_call_clobbered_registers start"); 3920 // Regular registers 3921 RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude; 3922 3923 int gp_area_size; 3924 int fp_area_size; 3925 int xmm_area_size; 3926 int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu, 3927 gp_area_size, fp_area_size, xmm_area_size); 3928 subptr(rsp, total_save_size); 3929 3930 push_set(gp_registers_to_push, 0); 3931 3932 #ifndef _LP64 3933 if (save_fpu && use_x87_registers()) { 3934 fnsave(Address(rsp, gp_area_size)); 3935 fwait(); 3936 } 3937 #endif 3938 if (save_fpu && use_xmm_registers()) { 3939 push_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size); 3940 } 3941 3942 block_comment("push_call_clobbered_registers end"); 3943 } 3944 3945 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) { 3946 block_comment("pop_call_clobbered_registers start"); 3947 3948 RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude; 3949 3950 int gp_area_size; 3951 int fp_area_size; 3952 int xmm_area_size; 3953 int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu, 3954 gp_area_size, fp_area_size, xmm_area_size); 3955 3956 if (restore_fpu && use_xmm_registers()) { 3957 pop_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size); 3958 } 3959 #ifndef _LP64 3960 if (restore_fpu && use_x87_registers()) { 3961 frstor(Address(rsp, gp_area_size)); 3962 } 3963 #endif 3964 3965 pop_set(gp_registers_to_pop, 0); 3966 3967 addptr(rsp, total_save_size); 3968 3969 vzeroupper(); 3970 3971 block_comment("pop_call_clobbered_registers end"); 3972 } 3973 3974 void MacroAssembler::push_set(XMMRegSet set, int offset) { 3975 assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be"); 3976 int spill_offset = offset; 3977 3978 for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) { 3979 save_xmm_register(this, spill_offset, *it); 3980 spill_offset += xmm_save_size(); 3981 } 3982 } 3983 3984 void MacroAssembler::pop_set(XMMRegSet set, int offset) { 3985 int restore_size = set.size() * xmm_save_size(); 3986 assert(is_aligned(restore_size, StackAlignmentInBytes), "must be"); 3987 3988 int restore_offset = offset + restore_size - xmm_save_size(); 3989 3990 for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) { 3991 restore_xmm_register(this, restore_offset, *it); 3992 restore_offset -= xmm_save_size(); 3993 } 3994 } 3995 3996 void MacroAssembler::push_set(RegSet set, int offset) { 3997 int spill_offset; 3998 if (offset == -1) { 3999 int register_push_size = set.size() * RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size; 4000 int aligned_size = align_up(register_push_size, StackAlignmentInBytes); 4001 subptr(rsp, aligned_size); 4002 spill_offset = 0; 4003 } else { 4004 spill_offset = offset; 4005 } 4006 4007 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) { 4008 movptr(Address(rsp, spill_offset), *it); 4009 spill_offset += RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size; 4010 } 4011 } 4012 4013 void MacroAssembler::pop_set(RegSet set, int offset) { 4014 4015 int gp_reg_size = RegisterImpl::max_slots_per_register * 
VMRegImpl::stack_slot_size; 4016 int restore_size = set.size() * gp_reg_size; 4017 int aligned_size = align_up(restore_size, StackAlignmentInBytes); 4018 4019 int restore_offset; 4020 if (offset == -1) { 4021 restore_offset = restore_size - gp_reg_size; 4022 } else { 4023 restore_offset = offset + restore_size - gp_reg_size; 4024 } 4025 for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) { 4026 movptr(*it, Address(rsp, restore_offset)); 4027 restore_offset -= gp_reg_size; 4028 } 4029 4030 if (offset == -1) { 4031 addptr(rsp, aligned_size); 4032 } 4033 } 4034 4035 // Defines obj, preserves var_size_in_bytes 4036 void MacroAssembler::eden_allocate(Register thread, Register obj, 4037 Register var_size_in_bytes, 4038 int con_size_in_bytes, 4039 Register t1, 4040 Label& slow_case) { 4041 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4042 bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); 4043 } 4044 4045 // Preserves the contents of address, destroys the contents length_in_bytes and temp. 4046 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) { 4047 assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different"); 4048 assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord"); 4049 Label done; 4050 4051 testptr(length_in_bytes, length_in_bytes); 4052 jcc(Assembler::zero, done); 4053 4054 // initialize topmost word, divide index by 2, check if odd and test if zero 4055 // note: for the remaining code to work, index must be a multiple of BytesPerWord 4056 #ifdef ASSERT 4057 { 4058 Label L; 4059 testptr(length_in_bytes, BytesPerWord - 1); 4060 jcc(Assembler::zero, L); 4061 stop("length must be a multiple of BytesPerWord"); 4062 bind(L); 4063 } 4064 #endif 4065 Register index = length_in_bytes; 4066 xorptr(temp, temp); // use _zero reg to clear memory (shorter code) 4067 if (UseIncDec) { 4068 shrptr(index, 3); // divide by 8/16 and set carry flag if bit 2 was set 4069 } else { 4070 shrptr(index, 2); // use 2 instructions to avoid partial flag stall 4071 shrptr(index, 1); 4072 } 4073 #ifndef _LP64 4074 // index could have not been a multiple of 8 (i.e., bit 2 was set) 4075 { 4076 Label even; 4077 // note: if index was a multiple of 8, then it cannot 4078 // be 0 now otherwise it must have been 0 before 4079 // => if it is even, we don't need to check for 0 again 4080 jcc(Assembler::carryClear, even); 4081 // clear topmost word (no jump would be needed if conditional assignment worked here) 4082 movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp); 4083 // index could be 0 now, must check again 4084 jcc(Assembler::zero, done); 4085 bind(even); 4086 } 4087 #endif // !_LP64 4088 // initialize remaining object fields: index is a multiple of 2 now 4089 { 4090 Label loop; 4091 bind(loop); 4092 movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp); 4093 NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);) 4094 decrement(index); 4095 jcc(Assembler::notZero, loop); 4096 } 4097 4098 bind(done); 4099 } 4100 4101 void MacroAssembler::get_inline_type_field_klass(Register klass, Register index, Register inline_klass) { 4102 movptr(inline_klass, Address(klass, InstanceKlass::inline_type_field_klasses_offset())); 4103 #ifdef ASSERT 4104 { 4105 Label done; 
4106 cmpptr(inline_klass, 0); 4107 jcc(Assembler::notEqual, done); 4108 stop("get_inline_type_field_klass contains no inline klass"); 4109 bind(done); 4110 } 4111 #endif 4112 movptr(inline_klass, Address(inline_klass, index, Address::times_ptr)); 4113 } 4114 4115 void MacroAssembler::get_default_value_oop(Register inline_klass, Register temp_reg, Register obj) { 4116 #ifdef ASSERT 4117 { 4118 Label done_check; 4119 test_klass_is_inline_type(inline_klass, temp_reg, done_check); 4120 stop("get_default_value_oop from non inline type klass"); 4121 bind(done_check); 4122 } 4123 #endif 4124 Register offset = temp_reg; 4125 // Getting the offset of the pre-allocated default value 4126 movptr(offset, Address(inline_klass, in_bytes(InstanceKlass::adr_inlineklass_fixed_block_offset()))); 4127 movl(offset, Address(offset, in_bytes(InlineKlass::default_value_offset_offset()))); 4128 4129 // Getting the mirror 4130 movptr(obj, Address(inline_klass, in_bytes(Klass::java_mirror_offset()))); 4131 resolve_oop_handle(obj, inline_klass); 4132 4133 // Getting the pre-allocated default value from the mirror 4134 Address field(obj, offset, Address::times_1); 4135 load_heap_oop(obj, field); 4136 } 4137 4138 void MacroAssembler::get_empty_inline_type_oop(Register inline_klass, Register temp_reg, Register obj) { 4139 #ifdef ASSERT 4140 { 4141 Label done_check; 4142 test_klass_is_empty_inline_type(inline_klass, temp_reg, done_check); 4143 stop("get_empty_value from non-empty inline klass"); 4144 bind(done_check); 4145 } 4146 #endif 4147 get_default_value_oop(inline_klass, temp_reg, obj); 4148 } 4149 4150 4151 // Look up the method for a megamorphic invokeinterface call. 4152 // The target method is determined by <intf_klass, itable_index>. 4153 // The receiver klass is in recv_klass. 4154 // On success, the result will be in method_result, and execution falls through. 4155 // On failure, execution transfers to the given label. 4156 void MacroAssembler::lookup_interface_method(Register recv_klass, 4157 Register intf_klass, 4158 RegisterOrConstant itable_index, 4159 Register method_result, 4160 Register scan_temp, 4161 Label& L_no_such_interface, 4162 bool return_method) { 4163 assert_different_registers(recv_klass, intf_klass, scan_temp); 4164 assert_different_registers(method_result, intf_klass, scan_temp); 4165 assert(recv_klass != method_result || !return_method, 4166 "recv_klass can be destroyed when method isn't needed"); 4167 4168 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 4169 "caller must use same register for non-constant itable index as for method"); 4170 4171 // Compute start of first itableOffsetEntry (which is at the end of the vtable) 4172 int vtable_base = in_bytes(Klass::vtable_start_offset()); 4173 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 4174 int scan_step = itableOffsetEntry::size() * wordSize; 4175 int vte_size = vtableEntry::size_in_bytes(); 4176 Address::ScaleFactor times_vte_scale = Address::times_ptr; 4177 assert(vte_size == wordSize, "else adjust times_vte_scale"); 4178 4179 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); 4180 4181 // %%% Could store the aligned, prescaled offset in the klassoop. 4182 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base)); 4183 4184 if (return_method) { 4185 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 
4186 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 4187 lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); 4188 } 4189 4190 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 4191 // if (scan->interface() == intf) { 4192 // result = (klass + scan->offset() + itable_index); 4193 // } 4194 // } 4195 Label search, found_method; 4196 4197 for (int peel = 1; peel >= 0; peel--) { 4198 movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes())); 4199 cmpptr(intf_klass, method_result); 4200 4201 if (peel) { 4202 jccb(Assembler::equal, found_method); 4203 } else { 4204 jccb(Assembler::notEqual, search); 4205 // (invert the test to fall through to found_method...) 4206 } 4207 4208 if (!peel) break; 4209 4210 bind(search); 4211 4212 // Check that the previous entry is non-null. A null entry means that 4213 // the receiver class doesn't implement the interface, and wasn't the 4214 // same as when the caller was compiled. 4215 testptr(method_result, method_result); 4216 jcc(Assembler::zero, L_no_such_interface); 4217 addptr(scan_temp, scan_step); 4218 } 4219 4220 bind(found_method); 4221 4222 if (return_method) { 4223 // Got a hit. 4224 movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes())); 4225 movptr(method_result, Address(recv_klass, scan_temp, Address::times_1)); 4226 } 4227 } 4228 4229 4230 // virtual method calling 4231 void MacroAssembler::lookup_virtual_method(Register recv_klass, 4232 RegisterOrConstant vtable_index, 4233 Register method_result) { 4234 const int base = in_bytes(Klass::vtable_start_offset()); 4235 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below"); 4236 Address vtable_entry_addr(recv_klass, 4237 vtable_index, Address::times_ptr, 4238 base + vtableEntry::method_offset_in_bytes()); 4239 movptr(method_result, vtable_entry_addr); 4240 } 4241 4242 4243 void MacroAssembler::check_klass_subtype(Register sub_klass, 4244 Register super_klass, 4245 Register temp_reg, 4246 Label& L_success) { 4247 Label L_failure; 4248 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL); 4249 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL); 4250 bind(L_failure); 4251 } 4252 4253 4254 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 4255 Register super_klass, 4256 Register temp_reg, 4257 Label* L_success, 4258 Label* L_failure, 4259 Label* L_slow_path, 4260 RegisterOrConstant super_check_offset) { 4261 assert_different_registers(sub_klass, super_klass, temp_reg); 4262 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 4263 if (super_check_offset.is_register()) { 4264 assert_different_registers(sub_klass, super_klass, 4265 super_check_offset.as_register()); 4266 } else if (must_load_sco) { 4267 assert(temp_reg != noreg, "supply either a temp or a register offset"); 4268 } 4269 4270 Label L_fallthrough; 4271 int label_nulls = 0; 4272 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 4273 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 4274 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 4275 assert(label_nulls <= 1, "at most one NULL in the batch"); 4276 4277 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 4278 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 4279 Address 
super_check_offset_addr(super_klass, sco_offset); 4280 4281 // Hacked jcc, which "knows" that L_fallthrough, at least, is in 4282 // range of a jccb. If this routine grows larger, reconsider at 4283 // least some of these. 4284 #define local_jcc(assembler_cond, label) \ 4285 if (&(label) == &L_fallthrough) jccb(assembler_cond, label); \ 4286 else jcc( assembler_cond, label) /*omit semi*/ 4287 4288 // Hacked jmp, which may only be used just before L_fallthrough. 4289 #define final_jmp(label) \ 4290 if (&(label) == &L_fallthrough) { /*do nothing*/ } \ 4291 else jmp(label) /*omit semi*/ 4292 4293 // If the pointers are equal, we are done (e.g., String[] elements). 4294 // This self-check enables sharing of secondary supertype arrays among 4295 // non-primary types such as array-of-interface. Otherwise, each such 4296 // type would need its own customized SSA. 4297 // We move this check to the front of the fast path because many 4298 // type checks are in fact trivially successful in this manner, 4299 // so we get a nicely predicted branch right at the start of the check. 4300 cmpptr(sub_klass, super_klass); 4301 local_jcc(Assembler::equal, *L_success); 4302 4303 // Check the supertype display: 4304 if (must_load_sco) { 4305 // Positive movl does right thing on LP64. 4306 movl(temp_reg, super_check_offset_addr); 4307 super_check_offset = RegisterOrConstant(temp_reg); 4308 } 4309 Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0); 4310 cmpptr(super_klass, super_check_addr); // load displayed supertype 4311 4312 // This check has worked decisively for primary supers. 4313 // Secondary supers are sought in the super_cache ('super_cache_addr'). 4314 // (Secondary supers are interfaces and very deeply nested subtypes.) 4315 // This works in the same check above because of a tricky aliasing 4316 // between the super_cache and the primary super display elements. 4317 // (The 'super_check_addr' can address either, as the case requires.) 4318 // Note that the cache is updated below if it does not help us find 4319 // what we need immediately. 4320 // So if it was a primary super, we can just fail immediately. 4321 // Otherwise, it's the slow path for us (no success at this point). 4322 4323 if (super_check_offset.is_register()) { 4324 local_jcc(Assembler::equal, *L_success); 4325 cmpl(super_check_offset.as_register(), sc_offset); 4326 if (L_failure == &L_fallthrough) { 4327 local_jcc(Assembler::equal, *L_slow_path); 4328 } else { 4329 local_jcc(Assembler::notEqual, *L_failure); 4330 final_jmp(*L_slow_path); 4331 } 4332 } else if (super_check_offset.as_constant() == sc_offset) { 4333 // Need a slow path; fast failure is impossible. 4334 if (L_slow_path == &L_fallthrough) { 4335 local_jcc(Assembler::equal, *L_success); 4336 } else { 4337 local_jcc(Assembler::notEqual, *L_slow_path); 4338 final_jmp(*L_success); 4339 } 4340 } else { 4341 // No slow path; it's a fast decision. 
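    // Example: for a constant super_check_offset that points into the primary
    // supers display (i.e. not the secondary_super_cache slot), the single cmpptr
    // above decides the whole test, and the code below merely branches to
    // L_success / L_failure on equal / not-equal.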
4342 if (L_failure == &L_fallthrough) { 4343 local_jcc(Assembler::equal, *L_success); 4344 } else { 4345 local_jcc(Assembler::notEqual, *L_failure); 4346 final_jmp(*L_success); 4347 } 4348 } 4349 4350 bind(L_fallthrough); 4351 4352 #undef local_jcc 4353 #undef final_jmp 4354 } 4355 4356 4357 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 4358 Register super_klass, 4359 Register temp_reg, 4360 Register temp2_reg, 4361 Label* L_success, 4362 Label* L_failure, 4363 bool set_cond_codes) { 4364 assert_different_registers(sub_klass, super_klass, temp_reg); 4365 if (temp2_reg != noreg) 4366 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg); 4367 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) 4368 4369 Label L_fallthrough; 4370 int label_nulls = 0; 4371 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 4372 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 4373 assert(label_nulls <= 1, "at most one NULL in the batch"); 4374 4375 // a couple of useful fields in sub_klass: 4376 int ss_offset = in_bytes(Klass::secondary_supers_offset()); 4377 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 4378 Address secondary_supers_addr(sub_klass, ss_offset); 4379 Address super_cache_addr( sub_klass, sc_offset); 4380 4381 // Do a linear scan of the secondary super-klass chain. 4382 // This code is rarely used, so simplicity is a virtue here. 4383 // The repne_scan instruction uses fixed registers, which we must spill. 4384 // Don't worry too much about pre-existing connections with the input regs. 4385 4386 assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super) 4387 assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter) 4388 4389 // Get super_klass value into rax (even if it was in rdi or rcx). 4390 bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false; 4391 if (super_klass != rax || UseCompressedOops) { 4392 if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; } 4393 mov(rax, super_klass); 4394 } 4395 if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; } 4396 if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; } 4397 4398 #ifndef PRODUCT 4399 int* pst_counter = &SharedRuntime::_partial_subtype_ctr; 4400 ExternalAddress pst_counter_addr((address) pst_counter); 4401 NOT_LP64( incrementl(pst_counter_addr) ); 4402 LP64_ONLY( lea(rcx, pst_counter_addr) ); 4403 LP64_ONLY( incrementl(Address(rcx, 0)) ); 4404 #endif //PRODUCT 4405 4406 // We will consult the secondary-super array. 4407 movptr(rdi, secondary_supers_addr); 4408 // Load the array length. (Positive movl does right thing on LP64.) 4409 movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes())); 4410 // Skip to start of data. 4411 addptr(rdi, Array<Klass*>::base_offset_in_bytes()); 4412 4413 // Scan RCX words at [RDI] for an occurrence of RAX. 4414 // Set NZ/Z based on last compare. 4415 // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does 4416 // not change flags (only scas instruction which is repeated sets flags). 4417 // Set Z = 0 (not equal) before 'repne' to indicate that class was not found. 4418 4419 testptr(rax,rax); // Set Z = 0 4420 repne_scan(); 4421 4422 // Unspill the temp. registers: 4423 if (pushed_rdi) pop(rdi); 4424 if (pushed_rcx) pop(rcx); 4425 if (pushed_rax) pop(rax); 4426 4427 if (set_cond_codes) { 4428 // Special hack for the AD files: rdi is guaranteed non-zero. 
4429 assert(!pushed_rdi, "rdi must be left non-NULL"); 4430 // Also, the condition codes are properly set Z/NZ on succeed/failure. 4431 } 4432 4433 if (L_failure == &L_fallthrough) 4434 jccb(Assembler::notEqual, *L_failure); 4435 else jcc(Assembler::notEqual, *L_failure); 4436 4437 // Success. Cache the super we found and proceed in triumph. 4438 movptr(super_cache_addr, super_klass); 4439 4440 if (L_success != &L_fallthrough) { 4441 jmp(*L_success); 4442 } 4443 4444 #undef IS_A_TEMP 4445 4446 bind(L_fallthrough); 4447 } 4448 4449 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 4450 assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required"); 4451 4452 Label L_fallthrough; 4453 if (L_fast_path == NULL) { 4454 L_fast_path = &L_fallthrough; 4455 } else if (L_slow_path == NULL) { 4456 L_slow_path = &L_fallthrough; 4457 } 4458 4459 // Fast path check: class is fully initialized 4460 cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized); 4461 jcc(Assembler::equal, *L_fast_path); 4462 4463 // Fast path check: current thread is initializer thread 4464 cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset())); 4465 if (L_slow_path == &L_fallthrough) { 4466 jcc(Assembler::equal, *L_fast_path); 4467 bind(*L_slow_path); 4468 } else if (L_fast_path == &L_fallthrough) { 4469 jcc(Assembler::notEqual, *L_slow_path); 4470 bind(*L_fast_path); 4471 } else { 4472 Unimplemented(); 4473 } 4474 } 4475 4476 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) { 4477 if (VM_Version::supports_cmov()) { 4478 cmovl(cc, dst, src); 4479 } else { 4480 Label L; 4481 jccb(negate_condition(cc), L); 4482 movl(dst, src); 4483 bind(L); 4484 } 4485 } 4486 4487 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) { 4488 if (VM_Version::supports_cmov()) { 4489 cmovl(cc, dst, src); 4490 } else { 4491 Label L; 4492 jccb(negate_condition(cc), L); 4493 movl(dst, src); 4494 bind(L); 4495 } 4496 } 4497 4498 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) { 4499 if (!VerifyOops || VerifyAdapterSharing) { 4500 // Below address of the code string confuses VerifyAdapterSharing 4501 // because it may differ between otherwise equivalent adapters. 
4502 return; 4503 } 4504 4505 // Pass register number to verify_oop_subroutine 4506 const char* b = NULL; 4507 { 4508 ResourceMark rm; 4509 stringStream ss; 4510 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line); 4511 b = code_string(ss.as_string()); 4512 } 4513 BLOCK_COMMENT("verify_oop {"); 4514 #ifdef _LP64 4515 push(rscratch1); // save r10, trashed by movptr() 4516 #endif 4517 push(rax); // save rax, 4518 push(reg); // pass register argument 4519 ExternalAddress buffer((address) b); 4520 // avoid using pushptr, as it modifies scratch registers 4521 // and our contract is not to modify anything 4522 movptr(rax, buffer.addr()); 4523 push(rax); 4524 // call indirectly to solve generation ordering problem 4525 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 4526 call(rax); 4527 // Caller pops the arguments (oop, message) and restores rax, r10 4528 BLOCK_COMMENT("} verify_oop"); 4529 } 4530 4531 void MacroAssembler::vallones(XMMRegister dst, int vector_len) { 4532 if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 4533 vpternlogd(dst, 0xFF, dst, dst, vector_len); 4534 } else { 4535 assert(UseAVX > 0, ""); 4536 vpcmpeqb(dst, dst, dst, vector_len); 4537 } 4538 } 4539 4540 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, 4541 int extra_slot_offset) { 4542 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 4543 int stackElementSize = Interpreter::stackElementSize; 4544 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); 4545 #ifdef ASSERT 4546 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); 4547 assert(offset1 - offset == stackElementSize, "correct arithmetic"); 4548 #endif 4549 Register scale_reg = noreg; 4550 Address::ScaleFactor scale_factor = Address::no_scale; 4551 if (arg_slot.is_constant()) { 4552 offset += arg_slot.as_constant() * stackElementSize; 4553 } else { 4554 scale_reg = arg_slot.as_register(); 4555 scale_factor = Address::times(stackElementSize); 4556 } 4557 offset += wordSize; // return PC is on stack 4558 return Address(rsp, scale_reg, scale_factor, offset); 4559 } 4560 4561 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) { 4562 if (!VerifyOops || VerifyAdapterSharing) { 4563 // Below address of the code string confuses VerifyAdapterSharing 4564 // because it may differ between otherwise equivalent adapters. 4565 return; 4566 } 4567 4568 // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord); 4569 // Pass register number to verify_oop_subroutine 4570 const char* b = NULL; 4571 { 4572 ResourceMark rm; 4573 stringStream ss; 4574 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line); 4575 b = code_string(ss.as_string()); 4576 } 4577 #ifdef _LP64 4578 push(rscratch1); // save r10, trashed by movptr() 4579 #endif 4580 push(rax); // save rax, 4581 // addr may contain rsp so we will have to adjust it based on the push 4582 // we just did (and on 64 bit we do two pushes) 4583 // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which 4584 // stores rax into addr which is backwards of what was intended. 
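  // The LP64_ONLY(2 *) BytesPerWord displacement used below compensates for the
  // pushes just emitted: on 64-bit both rscratch1 and rax are on the stack (two
  // words), on 32-bit only rax is (one word), so an rsp-relative addr still names
  // the slot the caller intended.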
4585 if (addr.uses(rsp)) { 4586 lea(rax, addr); 4587 pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord)); 4588 } else { 4589 pushptr(addr); 4590 } 4591 4592 ExternalAddress buffer((address) b); 4593 // pass msg argument 4594 // avoid using pushptr, as it modifies scratch registers 4595 // and our contract is not to modify anything 4596 movptr(rax, buffer.addr()); 4597 push(rax); 4598 4599 // call indirectly to solve generation ordering problem 4600 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 4601 call(rax); 4602 // Caller pops the arguments (addr, message) and restores rax, r10. 4603 } 4604 4605 void MacroAssembler::verify_tlab() { 4606 #ifdef ASSERT 4607 if (UseTLAB && VerifyOops) { 4608 Label next, ok; 4609 Register t1 = rsi; 4610 Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread); 4611 4612 push(t1); 4613 NOT_LP64(push(thread_reg)); 4614 NOT_LP64(get_thread(thread_reg)); 4615 4616 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset()))); 4617 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset()))); 4618 jcc(Assembler::aboveEqual, next); 4619 STOP("assert(top >= start)"); 4620 should_not_reach_here(); 4621 4622 bind(next); 4623 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset()))); 4624 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset()))); 4625 jcc(Assembler::aboveEqual, ok); 4626 STOP("assert(top <= end)"); 4627 should_not_reach_here(); 4628 4629 bind(ok); 4630 NOT_LP64(pop(thread_reg)); 4631 pop(t1); 4632 } 4633 #endif 4634 } 4635 4636 class ControlWord { 4637 public: 4638 int32_t _value; 4639 4640 int rounding_control() const { return (_value >> 10) & 3 ; } 4641 int precision_control() const { return (_value >> 8) & 3 ; } 4642 bool precision() const { return ((_value >> 5) & 1) != 0; } 4643 bool underflow() const { return ((_value >> 4) & 1) != 0; } 4644 bool overflow() const { return ((_value >> 3) & 1) != 0; } 4645 bool zero_divide() const { return ((_value >> 2) & 1) != 0; } 4646 bool denormalized() const { return ((_value >> 1) & 1) != 0; } 4647 bool invalid() const { return ((_value >> 0) & 1) != 0; } 4648 4649 void print() const { 4650 // rounding control 4651 const char* rc; 4652 switch (rounding_control()) { 4653 case 0: rc = "round near"; break; 4654 case 1: rc = "round down"; break; 4655 case 2: rc = "round up "; break; 4656 case 3: rc = "chop "; break; 4657 default: 4658 rc = NULL; // silence compiler warnings 4659 fatal("Unknown rounding control: %d", rounding_control()); 4660 }; 4661 // precision control 4662 const char* pc; 4663 switch (precision_control()) { 4664 case 0: pc = "24 bits "; break; 4665 case 1: pc = "reserved"; break; 4666 case 2: pc = "53 bits "; break; 4667 case 3: pc = "64 bits "; break; 4668 default: 4669 pc = NULL; // silence compiler warnings 4670 fatal("Unknown precision control: %d", precision_control()); 4671 }; 4672 // flags 4673 char f[9]; 4674 f[0] = ' '; 4675 f[1] = ' '; 4676 f[2] = (precision ()) ? 'P' : 'p'; 4677 f[3] = (underflow ()) ? 'U' : 'u'; 4678 f[4] = (overflow ()) ? 'O' : 'o'; 4679 f[5] = (zero_divide ()) ? 'Z' : 'z'; 4680 f[6] = (denormalized()) ? 'D' : 'd'; 4681 f[7] = (invalid ()) ? 
'I' : 'i'; 4682 f[8] = '\x0'; 4683 // output 4684 printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc); 4685 } 4686 4687 }; 4688 4689 class StatusWord { 4690 public: 4691 int32_t _value; 4692 4693 bool busy() const { return ((_value >> 15) & 1) != 0; } 4694 bool C3() const { return ((_value >> 14) & 1) != 0; } 4695 bool C2() const { return ((_value >> 10) & 1) != 0; } 4696 bool C1() const { return ((_value >> 9) & 1) != 0; } 4697 bool C0() const { return ((_value >> 8) & 1) != 0; } 4698 int top() const { return (_value >> 11) & 7 ; } 4699 bool error_status() const { return ((_value >> 7) & 1) != 0; } 4700 bool stack_fault() const { return ((_value >> 6) & 1) != 0; } 4701 bool precision() const { return ((_value >> 5) & 1) != 0; } 4702 bool underflow() const { return ((_value >> 4) & 1) != 0; } 4703 bool overflow() const { return ((_value >> 3) & 1) != 0; } 4704 bool zero_divide() const { return ((_value >> 2) & 1) != 0; } 4705 bool denormalized() const { return ((_value >> 1) & 1) != 0; } 4706 bool invalid() const { return ((_value >> 0) & 1) != 0; } 4707 4708 void print() const { 4709 // condition codes 4710 char c[5]; 4711 c[0] = (C3()) ? '3' : '-'; 4712 c[1] = (C2()) ? '2' : '-'; 4713 c[2] = (C1()) ? '1' : '-'; 4714 c[3] = (C0()) ? '0' : '-'; 4715 c[4] = '\x0'; 4716 // flags 4717 char f[9]; 4718 f[0] = (error_status()) ? 'E' : '-'; 4719 f[1] = (stack_fault ()) ? 'S' : '-'; 4720 f[2] = (precision ()) ? 'P' : '-'; 4721 f[3] = (underflow ()) ? 'U' : '-'; 4722 f[4] = (overflow ()) ? 'O' : '-'; 4723 f[5] = (zero_divide ()) ? 'Z' : '-'; 4724 f[6] = (denormalized()) ? 'D' : '-'; 4725 f[7] = (invalid ()) ? 'I' : '-'; 4726 f[8] = '\x0'; 4727 // output 4728 printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, f, c, top()); 4729 } 4730 4731 }; 4732 4733 class TagWord { 4734 public: 4735 int32_t _value; 4736 4737 int tag_at(int i) const { return (_value >> (i*2)) & 3; } 4738 4739 void print() const { 4740 printf("%04x", _value & 0xFFFF); 4741 } 4742 4743 }; 4744 4745 class FPU_Register { 4746 public: 4747 int32_t _m0; 4748 int32_t _m1; 4749 int16_t _ex; 4750 4751 bool is_indefinite() const { 4752 return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0; 4753 } 4754 4755 void print() const { 4756 char sign = (_ex < 0) ? '-' : '+'; 4757 const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : " "; 4758 printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, kind); 4759 }; 4760 4761 }; 4762 4763 class FPU_State { 4764 public: 4765 enum { 4766 register_size = 10, 4767 number_of_registers = 8, 4768 register_mask = 7 4769 }; 4770 4771 ControlWord _control_word; 4772 StatusWord _status_word; 4773 TagWord _tag_word; 4774 int32_t _error_offset; 4775 int32_t _error_selector; 4776 int32_t _data_offset; 4777 int32_t _data_selector; 4778 int8_t _register[register_size * number_of_registers]; 4779 4780 int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); } 4781 FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; } 4782 4783 const char* tag_as_string(int tag) const { 4784 switch (tag) { 4785 case 0: return "valid"; 4786 case 1: return "zero"; 4787 case 2: return "special"; 4788 case 3: return "empty"; 4789 } 4790 ShouldNotReachHere(); 4791 return NULL; 4792 } 4793 4794 void print() const { 4795 // print computation registers 4796 { int t = _status_word.top(); 4797 for (int i = 0; i < number_of_registers; i++) { 4798 int j = (i - t) & register_mask; 4799 printf("%c r%d = ST%d = ", (j == 0 ? 
'*' : ' '), i, j); 4800 st(j)->print(); 4801 printf(" %s\n", tag_as_string(_tag_word.tag_at(i))); 4802 } 4803 } 4804 printf("\n"); 4805 // print control registers 4806 printf("ctrl = "); _control_word.print(); printf("\n"); 4807 printf("stat = "); _status_word .print(); printf("\n"); 4808 printf("tags = "); _tag_word .print(); printf("\n"); 4809 } 4810 4811 }; 4812 4813 class Flag_Register { 4814 public: 4815 int32_t _value; 4816 4817 bool overflow() const { return ((_value >> 11) & 1) != 0; } 4818 bool direction() const { return ((_value >> 10) & 1) != 0; } 4819 bool sign() const { return ((_value >> 7) & 1) != 0; } 4820 bool zero() const { return ((_value >> 6) & 1) != 0; } 4821 bool auxiliary_carry() const { return ((_value >> 4) & 1) != 0; } 4822 bool parity() const { return ((_value >> 2) & 1) != 0; } 4823 bool carry() const { return ((_value >> 0) & 1) != 0; } 4824 4825 void print() const { 4826 // flags 4827 char f[8]; 4828 f[0] = (overflow ()) ? 'O' : '-'; 4829 f[1] = (direction ()) ? 'D' : '-'; 4830 f[2] = (sign ()) ? 'S' : '-'; 4831 f[3] = (zero ()) ? 'Z' : '-'; 4832 f[4] = (auxiliary_carry()) ? 'A' : '-'; 4833 f[5] = (parity ()) ? 'P' : '-'; 4834 f[6] = (carry ()) ? 'C' : '-'; 4835 f[7] = '\x0'; 4836 // output 4837 printf("%08x flags = %s", _value, f); 4838 } 4839 4840 }; 4841 4842 class IU_Register { 4843 public: 4844 int32_t _value; 4845 4846 void print() const { 4847 printf("%08x %11d", _value, _value); 4848 } 4849 4850 }; 4851 4852 class IU_State { 4853 public: 4854 Flag_Register _eflags; 4855 IU_Register _rdi; 4856 IU_Register _rsi; 4857 IU_Register _rbp; 4858 IU_Register _rsp; 4859 IU_Register _rbx; 4860 IU_Register _rdx; 4861 IU_Register _rcx; 4862 IU_Register _rax; 4863 4864 void print() const { 4865 // computation registers 4866 printf("rax, = "); _rax.print(); printf("\n"); 4867 printf("rbx, = "); _rbx.print(); printf("\n"); 4868 printf("rcx = "); _rcx.print(); printf("\n"); 4869 printf("rdx = "); _rdx.print(); printf("\n"); 4870 printf("rdi = "); _rdi.print(); printf("\n"); 4871 printf("rsi = "); _rsi.print(); printf("\n"); 4872 printf("rbp, = "); _rbp.print(); printf("\n"); 4873 printf("rsp = "); _rsp.print(); printf("\n"); 4874 printf("\n"); 4875 // control registers 4876 printf("flgs = "); _eflags.print(); printf("\n"); 4877 } 4878 }; 4879 4880 4881 class CPU_State { 4882 public: 4883 FPU_State _fpu_state; 4884 IU_State _iu_state; 4885 4886 void print() const { 4887 printf("--------------------------------------------------\n"); 4888 _iu_state .print(); 4889 printf("\n"); 4890 _fpu_state.print(); 4891 printf("--------------------------------------------------\n"); 4892 } 4893 4894 }; 4895 4896 4897 static void _print_CPU_state(CPU_State* state) { 4898 state->print(); 4899 }; 4900 4901 4902 void MacroAssembler::print_CPU_state() { 4903 push_CPU_state(); 4904 push(rsp); // pass CPU state 4905 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state))); 4906 addptr(rsp, wordSize); // discard argument 4907 pop_CPU_state(); 4908 } 4909 4910 4911 #ifndef _LP64 4912 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) { 4913 static int counter = 0; 4914 FPU_State* fs = &state->_fpu_state; 4915 counter++; 4916 // For leaf calls, only verify that the top few elements remain empty. 4917 // We only need 1 empty at the top for C2 code. 
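  // Convention (as used by the checks below): stack_depth >= 0 means "exactly
  // stack_depth x87 slots in use"; stack_depth < 0 means "at most -stack_depth
  // slots in use", the relaxed form that leaf calls are verified against.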
4918 if( stack_depth < 0 ) { 4919 if( fs->tag_for_st(7) != 3 ) { 4920 printf("FPR7 not empty\n"); 4921 state->print(); 4922 assert(false, "error"); 4923 return false; 4924 } 4925 return true; // All other stack states do not matter 4926 } 4927 4928 assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(), 4929 "bad FPU control word"); 4930 4931 // compute stack depth 4932 int i = 0; 4933 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) < 3) i++; 4934 int d = i; 4935 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++; 4936 // verify findings 4937 if (i != FPU_State::number_of_registers) { 4938 // stack not contiguous 4939 printf("%s: stack not contiguous at ST%d\n", s, i); 4940 state->print(); 4941 assert(false, "error"); 4942 return false; 4943 } 4944 // check if computed stack depth corresponds to expected stack depth 4945 if (stack_depth < 0) { 4946 // expected stack depth is -stack_depth or less 4947 if (d > -stack_depth) { 4948 // too many elements on the stack 4949 printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d); 4950 state->print(); 4951 assert(false, "error"); 4952 return false; 4953 } 4954 } else { 4955 // expected stack depth is stack_depth 4956 if (d != stack_depth) { 4957 // wrong stack depth 4958 printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d); 4959 state->print(); 4960 assert(false, "error"); 4961 return false; 4962 } 4963 } 4964 // everything is cool 4965 return true; 4966 } 4967 4968 void MacroAssembler::verify_FPU(int stack_depth, const char* s) { 4969 if (!VerifyFPU) return; 4970 push_CPU_state(); 4971 push(rsp); // pass CPU state 4972 ExternalAddress msg((address) s); 4973 // pass message string s 4974 pushptr(msg.addr()); 4975 push(stack_depth); // pass stack depth 4976 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU))); 4977 addptr(rsp, 3 * wordSize); // discard arguments 4978 // check for error 4979 { Label L; 4980 testl(rax, rax); 4981 jcc(Assembler::notZero, L); 4982 int3(); // break if error condition 4983 bind(L); 4984 } 4985 pop_CPU_state(); 4986 } 4987 #endif // _LP64 4988 4989 void MacroAssembler::restore_cpu_control_state_after_jni() { 4990 // Either restore the MXCSR register after returning from the JNI Call 4991 // or verify that it wasn't changed (with -Xcheck:jni flag). 4992 if (VM_Version::supports_sse()) { 4993 if (RestoreMXCSROnJNICalls) { 4994 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std())); 4995 } else if (CheckJNICalls) { 4996 call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry())); 4997 } 4998 } 4999 // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty. 5000 vzeroupper(); 5001 // Reset k1 to 0xffff. 5002 5003 #ifdef COMPILER2 5004 if (PostLoopMultiversioning && VM_Version::supports_evex()) { 5005 push(rcx); 5006 movl(rcx, 0xffff); 5007 kmovwl(k1, rcx); 5008 pop(rcx); 5009 } 5010 #endif // COMPILER2 5011 5012 #ifndef _LP64 5013 // Either restore the x87 floating pointer control word after returning 5014 // from the JNI call or verify that it wasn't changed. 
5015 if (CheckJNICalls) { 5016 call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry())); 5017 } 5018 #endif // _LP64 5019 } 5020 5021 // ((OopHandle)result).resolve(); 5022 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 5023 assert_different_registers(result, tmp); 5024 5025 // Only 64 bit platforms support GCs that require a tmp register 5026 // Only IN_HEAP loads require a thread_tmp register 5027 // OopHandle::resolve is an indirection like jobject. 5028 access_load_at(T_OBJECT, IN_NATIVE, 5029 result, Address(result, 0), tmp, /*tmp_thread*/noreg); 5030 } 5031 5032 // ((WeakHandle)result).resolve(); 5033 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) { 5034 assert_different_registers(rresult, rtmp); 5035 Label resolved; 5036 5037 // A null weak handle resolves to null. 5038 cmpptr(rresult, 0); 5039 jcc(Assembler::equal, resolved); 5040 5041 // Only 64 bit platforms support GCs that require a tmp register 5042 // Only IN_HEAP loads require a thread_tmp register 5043 // WeakHandle::resolve is an indirection like jweak. 5044 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, 5045 rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg); 5046 bind(resolved); 5047 } 5048 5049 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) { 5050 // get mirror 5051 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 5052 load_method_holder(mirror, method); 5053 movptr(mirror, Address(mirror, mirror_offset)); 5054 resolve_oop_handle(mirror, tmp); 5055 } 5056 5057 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) { 5058 load_method_holder(rresult, rmethod); 5059 movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset())); 5060 } 5061 5062 void MacroAssembler::load_method_holder(Register holder, Register method) { 5063 movptr(holder, Address(method, Method::const_offset())); // ConstMethod* 5064 movptr(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool* 5065 movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass* 5066 } 5067 5068 void MacroAssembler::load_metadata(Register dst, Register src) { 5069 if (UseCompressedClassPointers) { 5070 movl(dst, Address(src, oopDesc::klass_offset_in_bytes())); 5071 } else { 5072 movptr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 5073 } 5074 } 5075 5076 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) { 5077 assert_different_registers(src, tmp); 5078 assert_different_registers(dst, tmp); 5079 #ifdef _LP64 5080 if (UseCompressedClassPointers) { 5081 movl(dst, Address(src, oopDesc::klass_offset_in_bytes())); 5082 decode_klass_not_null(dst, tmp); 5083 } else 5084 #endif 5085 movptr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 5086 } 5087 5088 void MacroAssembler::load_prototype_header(Register dst, Register src, Register tmp) { 5089 load_klass(dst, src, tmp); 5090 movptr(dst, Address(dst, Klass::prototype_header_offset())); 5091 } 5092 5093 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) { 5094 assert_different_registers(src, tmp); 5095 assert_different_registers(dst, tmp); 5096 #ifdef _LP64 5097 if (UseCompressedClassPointers) { 5098 encode_klass_not_null(src, tmp); 5099 movl(Address(dst, oopDesc::klass_offset_in_bytes()), src); 5100 } else 5101 #endif 5102 movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src); 5103 } 5104 5105 void MacroAssembler::access_load_at(BasicType type, 
DecoratorSet decorators, Register dst, Address src, 5106 Register tmp1, Register thread_tmp) { 5107 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 5108 decorators = AccessInternal::decorator_fixup(decorators); 5109 bool as_raw = (decorators & AS_RAW) != 0; 5110 if (as_raw) { 5111 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 5112 } else { 5113 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 5114 } 5115 } 5116 5117 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src, 5118 Register tmp1, Register tmp2, Register tmp3) { 5119 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 5120 decorators = AccessInternal::decorator_fixup(decorators); 5121 bool as_raw = (decorators & AS_RAW) != 0; 5122 if (as_raw) { 5123 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3); 5124 } else { 5125 bs->store_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3); 5126 } 5127 } 5128 5129 void MacroAssembler::access_value_copy(DecoratorSet decorators, Register src, Register dst, 5130 Register inline_klass) { 5131 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 5132 bs->value_copy(this, decorators, src, dst, inline_klass); 5133 } 5134 5135 void MacroAssembler::first_field_offset(Register inline_klass, Register offset) { 5136 movptr(offset, Address(inline_klass, InstanceKlass::adr_inlineklass_fixed_block_offset())); 5137 movl(offset, Address(offset, InlineKlass::first_field_offset_offset())); 5138 } 5139 5140 void MacroAssembler::data_for_oop(Register oop, Register data, Register inline_klass) { 5141 // ((address) (void*) o) + vk->first_field_offset(); 5142 Register offset = (data == oop) ? 
rscratch1 : data;
  first_field_offset(inline_klass, offset);
  if (data == oop) {
    addptr(data, offset);
  } else {
    lea(data, Address(oop, offset));
  }
}

void MacroAssembler::data_for_value_array_index(Register array, Register array_klass,
                                                Register index, Register data) {
  assert(index != rcx, "index needs to shift by rcx");
  assert_different_registers(array, array_klass, index);
  assert_different_registers(rcx, array, index);

  // array->base() + (index << Klass::layout_helper_log2_element_size(lh));
  movl(rcx, Address(array_klass, Klass::layout_helper_offset()));

  // Klass::layout_helper_log2_element_size(lh)
  // (lh >> _lh_log2_element_size_shift) & _lh_log2_element_size_mask;
  shrl(rcx, Klass::_lh_log2_element_size_shift);
  andl(rcx, Klass::_lh_log2_element_size_mask);
  shlptr(index); // index << rcx

  lea(data, Address(array, index, Address::times_1, arrayOopDesc::base_offset_in_bytes(T_PRIMITIVE_OBJECT)));
}

void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// Doesn't do verification, generates fixed size code
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}

void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register tmp2, Register tmp3, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2, tmp3);
}

// Used for storing NULLs.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
}

#ifdef _LP64
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
  }
}

#ifdef ASSERT
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    const auto src2 = ExternalAddress((address)CompressedOops::ptrs_base_addr());
    assert(!src2.is_lval(), "should not be lval");
    const bool is_src2_reachable = reachable(src2);
    if (!is_src2_reachable) {
      push(rscratch1);  // cmpptr trashes rscratch1
    }
    cmpptr(r12_heapbase, src2);
    jcc(Assembler::equal, ok);
    STOP(msg);
    bind(ok);
    if (!is_src2_reachable) {
      pop(rscratch1);
    }
  }
}
#endif

// Algorithm must match oop.inline.hpp encode_heap_oop.
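
// A minimal C++ sketch of what the compressed-oop encode/decode below computes,
// treating addresses as plain uint64_t values. Here 'base' stands for
// CompressedOops::base() (kept in r12_heapbase) and 'shift' for
// CompressedOops::shift() (LogMinObjAlignmentInBytes). The helper names and the
// standalone form are illustrative only; <cstdint> is assumed to be available.
static inline uint32_t sketch_encode_heap_oop(uint64_t oop, uint64_t base, unsigned shift) {
  if (oop == 0) return 0;                      // NULL stays NULL (testq + cmovq below)
  return (uint32_t)((oop - base) >> shift);    // subq r12_heapbase; shrq shift
}

static inline uint64_t sketch_decode_heap_oop(uint32_t narrow, uint64_t base, unsigned shift) {
  uint64_t v = (uint64_t)narrow << shift;      // shlq shift
  if (v == 0) return 0;                        // NULL stays NULL (jccb(equal, done) below)
  return v + base;                             // addq r12_heapbase
}
// When CompressedOops::base() is NULL the subtraction/addition drops out and
// only the shift remains, which is the fast path taken by the code below.
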
5222 void MacroAssembler::encode_heap_oop(Register r) { 5223 #ifdef ASSERT 5224 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?"); 5225 #endif 5226 verify_oop_msg(r, "broken oop in encode_heap_oop"); 5227 if (CompressedOops::base() == NULL) { 5228 if (CompressedOops::shift() != 0) { 5229 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 5230 shrq(r, LogMinObjAlignmentInBytes); 5231 } 5232 return; 5233 } 5234 testq(r, r); 5235 cmovq(Assembler::equal, r, r12_heapbase); 5236 subq(r, r12_heapbase); 5237 shrq(r, LogMinObjAlignmentInBytes); 5238 } 5239 5240 void MacroAssembler::encode_heap_oop_not_null(Register r) { 5241 #ifdef ASSERT 5242 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 5243 if (CheckCompressedOops) { 5244 Label ok; 5245 testq(r, r); 5246 jcc(Assembler::notEqual, ok); 5247 STOP("null oop passed to encode_heap_oop_not_null"); 5248 bind(ok); 5249 } 5250 #endif 5251 verify_oop_msg(r, "broken oop in encode_heap_oop_not_null"); 5252 if (CompressedOops::base() != NULL) { 5253 subq(r, r12_heapbase); 5254 } 5255 if (CompressedOops::shift() != 0) { 5256 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 5257 shrq(r, LogMinObjAlignmentInBytes); 5258 } 5259 } 5260 5261 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 5262 #ifdef ASSERT 5263 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 5264 if (CheckCompressedOops) { 5265 Label ok; 5266 testq(src, src); 5267 jcc(Assembler::notEqual, ok); 5268 STOP("null oop passed to encode_heap_oop_not_null2"); 5269 bind(ok); 5270 } 5271 #endif 5272 verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2"); 5273 if (dst != src) { 5274 movq(dst, src); 5275 } 5276 if (CompressedOops::base() != NULL) { 5277 subq(dst, r12_heapbase); 5278 } 5279 if (CompressedOops::shift() != 0) { 5280 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 5281 shrq(dst, LogMinObjAlignmentInBytes); 5282 } 5283 } 5284 5285 void MacroAssembler::decode_heap_oop(Register r) { 5286 #ifdef ASSERT 5287 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 5288 #endif 5289 if (CompressedOops::base() == NULL) { 5290 if (CompressedOops::shift() != 0) { 5291 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 5292 shlq(r, LogMinObjAlignmentInBytes); 5293 } 5294 } else { 5295 Label done; 5296 shlq(r, LogMinObjAlignmentInBytes); 5297 jccb(Assembler::equal, done); 5298 addq(r, r12_heapbase); 5299 bind(done); 5300 } 5301 verify_oop_msg(r, "broken oop in decode_heap_oop"); 5302 } 5303 5304 void MacroAssembler::decode_heap_oop_not_null(Register r) { 5305 // Note: it will change flags 5306 assert (UseCompressedOops, "should only be used for compressed headers"); 5307 assert (Universe::heap() != NULL, "java heap should be initialized"); 5308 // Cannot assert, unverified entry point counts instructions (see .ad file) 5309 // vtableStubs also counts instructions in pd_code_size_limit. 5310 // Also do not verify_oop as this is called by verify_oop. 
5311 if (CompressedOops::shift() != 0) { 5312 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 5313 shlq(r, LogMinObjAlignmentInBytes); 5314 if (CompressedOops::base() != NULL) { 5315 addq(r, r12_heapbase); 5316 } 5317 } else { 5318 assert (CompressedOops::base() == NULL, "sanity"); 5319 } 5320 } 5321 5322 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 5323 // Note: it will change flags 5324 assert (UseCompressedOops, "should only be used for compressed headers"); 5325 assert (Universe::heap() != NULL, "java heap should be initialized"); 5326 // Cannot assert, unverified entry point counts instructions (see .ad file) 5327 // vtableStubs also counts instructions in pd_code_size_limit. 5328 // Also do not verify_oop as this is called by verify_oop. 5329 if (CompressedOops::shift() != 0) { 5330 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 5331 if (LogMinObjAlignmentInBytes == Address::times_8) { 5332 leaq(dst, Address(r12_heapbase, src, Address::times_8, 0)); 5333 } else { 5334 if (dst != src) { 5335 movq(dst, src); 5336 } 5337 shlq(dst, LogMinObjAlignmentInBytes); 5338 if (CompressedOops::base() != NULL) { 5339 addq(dst, r12_heapbase); 5340 } 5341 } 5342 } else { 5343 assert (CompressedOops::base() == NULL, "sanity"); 5344 if (dst != src) { 5345 movq(dst, src); 5346 } 5347 } 5348 } 5349 5350 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) { 5351 assert_different_registers(r, tmp); 5352 if (CompressedKlassPointers::base() != NULL) { 5353 mov64(tmp, (int64_t)CompressedKlassPointers::base()); 5354 subq(r, tmp); 5355 } 5356 if (CompressedKlassPointers::shift() != 0) { 5357 assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong"); 5358 shrq(r, LogKlassAlignmentInBytes); 5359 } 5360 } 5361 5362 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) { 5363 assert_different_registers(src, dst); 5364 if (CompressedKlassPointers::base() != NULL) { 5365 mov64(dst, -(int64_t)CompressedKlassPointers::base()); 5366 addq(dst, src); 5367 } else { 5368 movptr(dst, src); 5369 } 5370 if (CompressedKlassPointers::shift() != 0) { 5371 assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong"); 5372 shrq(dst, LogKlassAlignmentInBytes); 5373 } 5374 } 5375 5376 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) { 5377 assert_different_registers(r, tmp); 5378 // Note: it will change flags 5379 assert(UseCompressedClassPointers, "should only be used for compressed headers"); 5380 // Cannot assert, unverified entry point counts instructions (see .ad file) 5381 // vtableStubs also counts instructions in pd_code_size_limit. 5382 // Also do not verify_oop as this is called by verify_oop. 
5383 if (CompressedKlassPointers::shift() != 0) { 5384 assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong"); 5385 shlq(r, LogKlassAlignmentInBytes); 5386 } 5387 if (CompressedKlassPointers::base() != NULL) { 5388 mov64(tmp, (int64_t)CompressedKlassPointers::base()); 5389 addq(r, tmp); 5390 } 5391 } 5392 5393 void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) { 5394 assert_different_registers(src, dst); 5395 // Note: it will change flags 5396 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 5397 // Cannot assert, unverified entry point counts instructions (see .ad file) 5398 // vtableStubs also counts instructions in pd_code_size_limit. 5399 // Also do not verify_oop as this is called by verify_oop. 5400 5401 if (CompressedKlassPointers::base() == NULL && 5402 CompressedKlassPointers::shift() == 0) { 5403 // The best case scenario is that there is no base or shift. Then it is already 5404 // a pointer that needs nothing but a register rename. 5405 movl(dst, src); 5406 } else { 5407 if (CompressedKlassPointers::base() != NULL) { 5408 mov64(dst, (int64_t)CompressedKlassPointers::base()); 5409 } else { 5410 xorq(dst, dst); 5411 } 5412 if (CompressedKlassPointers::shift() != 0) { 5413 assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong"); 5414 assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?"); 5415 leaq(dst, Address(dst, src, Address::times_8, 0)); 5416 } else { 5417 addq(dst, src); 5418 } 5419 } 5420 } 5421 5422 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 5423 assert (UseCompressedOops, "should only be used for compressed headers"); 5424 assert (Universe::heap() != NULL, "java heap should be initialized"); 5425 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5426 int oop_index = oop_recorder()->find_index(obj); 5427 RelocationHolder rspec = oop_Relocation::spec(oop_index); 5428 mov_narrow_oop(dst, oop_index, rspec); 5429 } 5430 5431 void MacroAssembler::set_narrow_oop(Address dst, jobject obj) { 5432 assert (UseCompressedOops, "should only be used for compressed headers"); 5433 assert (Universe::heap() != NULL, "java heap should be initialized"); 5434 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5435 int oop_index = oop_recorder()->find_index(obj); 5436 RelocationHolder rspec = oop_Relocation::spec(oop_index); 5437 mov_narrow_oop(dst, oop_index, rspec); 5438 } 5439 5440 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 5441 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 5442 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5443 int klass_index = oop_recorder()->find_index(k); 5444 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 5445 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 5446 } 5447 5448 void MacroAssembler::set_narrow_klass(Address dst, Klass* k) { 5449 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 5450 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5451 int klass_index = oop_recorder()->find_index(k); 5452 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 5453 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 5454 } 5455 5456 void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) { 5457 assert (UseCompressedOops, "should only be used for 
compressed headers"); 5458 assert (Universe::heap() != NULL, "java heap should be initialized"); 5459 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5460 int oop_index = oop_recorder()->find_index(obj); 5461 RelocationHolder rspec = oop_Relocation::spec(oop_index); 5462 Assembler::cmp_narrow_oop(dst, oop_index, rspec); 5463 } 5464 5465 void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) { 5466 assert (UseCompressedOops, "should only be used for compressed headers"); 5467 assert (Universe::heap() != NULL, "java heap should be initialized"); 5468 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5469 int oop_index = oop_recorder()->find_index(obj); 5470 RelocationHolder rspec = oop_Relocation::spec(oop_index); 5471 Assembler::cmp_narrow_oop(dst, oop_index, rspec); 5472 } 5473 5474 void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) { 5475 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 5476 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5477 int klass_index = oop_recorder()->find_index(k); 5478 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 5479 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 5480 } 5481 5482 void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) { 5483 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 5484 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5485 int klass_index = oop_recorder()->find_index(k); 5486 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 5487 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 5488 } 5489 5490 void MacroAssembler::reinit_heapbase() { 5491 if (UseCompressedOops) { 5492 if (Universe::heap() != NULL) { 5493 if (CompressedOops::base() == NULL) { 5494 MacroAssembler::xorptr(r12_heapbase, r12_heapbase); 5495 } else { 5496 mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base()); 5497 } 5498 } else { 5499 movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr())); 5500 } 5501 } 5502 } 5503 5504 #endif // _LP64 5505 5506 #ifdef COMPILER2 5507 // C2 compiled method's prolog code. 5508 void MacroAssembler::verified_entry(Compile* C, int sp_inc) { 5509 int framesize = C->output()->frame_size_in_bytes(); 5510 int bangsize = C->output()->bang_size_in_bytes(); 5511 bool fp_mode_24b = false; 5512 int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0; 5513 5514 // WARNING: Initial instruction MUST be 5 bytes or longer so that 5515 // NativeJump::patch_verified_entry will be able to patch out the entry 5516 // code safely. The push to verify stack depth is ok at 5 bytes, 5517 // the frame allocation can be either 3 or 6 bytes. So if we don't do 5518 // stack bang then we must use the 6 byte frame allocation even if 5519 // we have no frame. :-( 5520 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 5521 5522 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 5523 // Remove word for return addr 5524 framesize -= wordSize; 5525 stack_bang_size -= wordSize; 5526 5527 // Calls to C2R adapters often do not accept exceptional returns. 5528 // We require that their callers must bang for them. But be careful, because 5529 // some VM calls (such as call site linkage) can use several kilobytes of 5530 // stack. But the stack safety zone should account for that. 
5531 // See bugs 4446381, 4468289, 4497237. 5532 if (stack_bang_size > 0) { 5533 generate_stack_overflow_check(stack_bang_size); 5534 5535 // We always push rbp, so that on return to interpreter rbp, will be 5536 // restored correctly and we can correct the stack. 5537 push(rbp); 5538 // Save caller's stack pointer into RBP if the frame pointer is preserved. 5539 if (PreserveFramePointer) { 5540 mov(rbp, rsp); 5541 } 5542 // Remove word for ebp 5543 framesize -= wordSize; 5544 5545 // Create frame 5546 if (framesize) { 5547 subptr(rsp, framesize); 5548 } 5549 } else { 5550 // Create frame (force generation of a 4 byte immediate value) 5551 subptr_imm32(rsp, framesize); 5552 5553 // Save RBP register now. 5554 framesize -= wordSize; 5555 movptr(Address(rsp, framesize), rbp); 5556 // Save caller's stack pointer into RBP if the frame pointer is preserved. 5557 if (PreserveFramePointer) { 5558 movptr(rbp, rsp); 5559 if (framesize > 0) { 5560 addptr(rbp, framesize); 5561 } 5562 } 5563 } 5564 5565 if (C->needs_stack_repair()) { 5566 // Save stack increment just below the saved rbp (also account for fixed framesize and rbp) 5567 assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned"); 5568 movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize); 5569 } 5570 5571 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 5572 framesize -= wordSize; 5573 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 5574 } 5575 5576 #ifndef _LP64 5577 // If method sets FPU control word do it now 5578 if (fp_mode_24b) { 5579 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 5580 } 5581 if (UseSSE >= 2 && VerifyFPU) { 5582 verify_FPU(0, "FPU stack must be clean on entry"); 5583 } 5584 #endif 5585 5586 #ifdef ASSERT 5587 if (VerifyStackAtCalls) { 5588 Label L; 5589 push(rax); 5590 mov(rax, rsp); 5591 andptr(rax, StackAlignmentInBytes-1); 5592 cmpptr(rax, StackAlignmentInBytes-wordSize); 5593 pop(rax); 5594 jcc(Assembler::equal, L); 5595 STOP("Stack is not properly aligned!"); 5596 bind(L); 5597 } 5598 #endif 5599 } 5600 #endif // COMPILER2 5601 5602 #if COMPILER2_OR_JVMCI 5603 5604 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers 5605 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp, KRegister mask) { 5606 // cnt - number of qwords (8-byte words). 5607 // base - start address, qword aligned. 
5608 Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end; 5609 bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0); 5610 if (use64byteVector) { 5611 evpbroadcastq(xtmp, val, AVX_512bit); 5612 } else if (MaxVectorSize >= 32) { 5613 movdq(xtmp, val); 5614 punpcklqdq(xtmp, xtmp); 5615 vinserti128_high(xtmp, xtmp); 5616 } else { 5617 movdq(xtmp, val); 5618 punpcklqdq(xtmp, xtmp); 5619 } 5620 jmp(L_zero_64_bytes); 5621 5622 BIND(L_loop); 5623 if (MaxVectorSize >= 32) { 5624 fill64(base, 0, xtmp, use64byteVector); 5625 } else { 5626 movdqu(Address(base, 0), xtmp); 5627 movdqu(Address(base, 16), xtmp); 5628 movdqu(Address(base, 32), xtmp); 5629 movdqu(Address(base, 48), xtmp); 5630 } 5631 addptr(base, 64); 5632 5633 BIND(L_zero_64_bytes); 5634 subptr(cnt, 8); 5635 jccb(Assembler::greaterEqual, L_loop); 5636 5637 // Copy trailing 64 bytes 5638 if (use64byteVector) { 5639 addptr(cnt, 8); 5640 jccb(Assembler::equal, L_end); 5641 fill64_masked(3, base, 0, xtmp, mask, cnt, val, true); 5642 jmp(L_end); 5643 } else { 5644 addptr(cnt, 4); 5645 jccb(Assembler::less, L_tail); 5646 if (MaxVectorSize >= 32) { 5647 vmovdqu(Address(base, 0), xtmp); 5648 } else { 5649 movdqu(Address(base, 0), xtmp); 5650 movdqu(Address(base, 16), xtmp); 5651 } 5652 } 5653 addptr(base, 32); 5654 subptr(cnt, 4); 5655 5656 BIND(L_tail); 5657 addptr(cnt, 4); 5658 jccb(Assembler::lessEqual, L_end); 5659 if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) { 5660 fill32_masked(3, base, 0, xtmp, mask, cnt, val); 5661 } else { 5662 decrement(cnt); 5663 5664 BIND(L_sloop); 5665 movq(Address(base, 0), xtmp); 5666 addptr(base, 8); 5667 decrement(cnt); 5668 jccb(Assembler::greaterEqual, L_sloop); 5669 } 5670 BIND(L_end); 5671 } 5672 5673 int MacroAssembler::store_inline_type_fields_to_buf(ciInlineKlass* vk, bool from_interpreter) { 5674 assert(InlineTypeReturnedAsFields, "Inline types should never be returned as fields"); 5675 // An inline type might be returned. If fields are in registers we 5676 // need to allocate an inline type instance and initialize it with 5677 // the value of the fields. 5678 Label skip; 5679 // We only need a new buffered inline type if a new one is not returned 5680 testptr(rax, 1); 5681 jcc(Assembler::zero, skip); 5682 int call_offset = -1; 5683 5684 #ifdef _LP64 5685 // The following code is similar to allocate_instance but has some slight differences, 5686 // e.g. object size is always not zero, sometimes it's constant; storing klass ptr after 5687 // allocating is not necessary if vk != NULL, etc. allocate_instance is not aware of these. 5688 Label slow_case; 5689 // 1. Try to allocate a new buffered inline instance either from TLAB or eden space 5690 mov(rscratch1, rax); // save rax for slow_case since *_allocate may corrupt it when allocation failed 5691 if (vk != NULL) { 5692 // Called from C1, where the return type is statically known. 5693 movptr(rbx, (intptr_t)vk->get_InlineKlass()); 5694 jint obj_size = vk->layout_helper(); 5695 assert(obj_size != Klass::_lh_neutral_value, "inline class in return type must have been resolved"); 5696 if (UseTLAB) { 5697 tlab_allocate(r15_thread, rax, noreg, obj_size, r13, r14, slow_case); 5698 } else { 5699 eden_allocate(r15_thread, rax, noreg, obj_size, r13, slow_case); 5700 } 5701 } else { 5702 // Call from interpreter. 
RAX contains ((the InlineKlass* of the return type) | 0x01) 5703 mov(rbx, rax); 5704 andptr(rbx, -2); 5705 movl(r14, Address(rbx, Klass::layout_helper_offset())); 5706 if (UseTLAB) { 5707 tlab_allocate(r15_thread, rax, r14, 0, r13, r14, slow_case); 5708 } else { 5709 eden_allocate(r15_thread, rax, r14, 0, r13, slow_case); 5710 } 5711 } 5712 if (UseTLAB || Universe::heap()->supports_inline_contig_alloc()) { 5713 // 2. Initialize buffered inline instance header 5714 Register buffer_obj = rax; 5715 movptr(Address(buffer_obj, oopDesc::mark_offset_in_bytes()), (intptr_t)markWord::inline_type_prototype().value()); 5716 xorl(r13, r13); 5717 store_klass_gap(buffer_obj, r13); 5718 if (vk == NULL) { 5719 // store_klass corrupts rbx(klass), so save it in r13 for later use (interpreter case only). 5720 mov(r13, rbx); 5721 } 5722 Register tmp_store_klass = LP64_ONLY(rscratch1) NOT_LP64(noreg); 5723 store_klass(buffer_obj, rbx, tmp_store_klass); 5724 // 3. Initialize its fields with an inline class specific handler 5725 if (vk != NULL) { 5726 call(RuntimeAddress(vk->pack_handler())); // no need for call info as this will not safepoint. 5727 } else { 5728 movptr(rbx, Address(r13, InstanceKlass::adr_inlineklass_fixed_block_offset())); 5729 movptr(rbx, Address(rbx, InlineKlass::pack_handler_offset())); 5730 call(rbx); 5731 } 5732 jmp(skip); 5733 } 5734 bind(slow_case); 5735 // We failed to allocate a new inline type, fall back to a runtime 5736 // call. Some oop field may be live in some registers but we can't 5737 // tell. That runtime call will take care of preserving them 5738 // across a GC if there's one. 5739 mov(rax, rscratch1); 5740 #endif 5741 5742 if (from_interpreter) { 5743 super_call_VM_leaf(StubRoutines::store_inline_type_fields_to_buf()); 5744 } else { 5745 call(RuntimeAddress(StubRoutines::store_inline_type_fields_to_buf())); 5746 call_offset = offset(); 5747 } 5748 5749 bind(skip); 5750 return call_offset; 5751 } 5752 5753 // Move a value between registers/stack slots and update the reg_state 5754 bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]) { 5755 assert(from->is_valid() && to->is_valid(), "source and destination must be valid"); 5756 if (reg_state[to->value()] == reg_written) { 5757 return true; // Already written 5758 } 5759 if (from != to && bt != T_VOID) { 5760 if (reg_state[to->value()] == reg_readonly) { 5761 return false; // Not yet writable 5762 } 5763 if (from->is_reg()) { 5764 if (to->is_reg()) { 5765 if (from->is_XMMRegister()) { 5766 if (bt == T_DOUBLE) { 5767 movdbl(to->as_XMMRegister(), from->as_XMMRegister()); 5768 } else { 5769 assert(bt == T_FLOAT, "must be float"); 5770 movflt(to->as_XMMRegister(), from->as_XMMRegister()); 5771 } 5772 } else { 5773 movq(to->as_Register(), from->as_Register()); 5774 } 5775 } else { 5776 int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 5777 Address to_addr = Address(rsp, st_off); 5778 if (from->is_XMMRegister()) { 5779 if (bt == T_DOUBLE) { 5780 movdbl(to_addr, from->as_XMMRegister()); 5781 } else { 5782 assert(bt == T_FLOAT, "must be float"); 5783 movflt(to_addr, from->as_XMMRegister()); 5784 } 5785 } else { 5786 movq(to_addr, from->as_Register()); 5787 } 5788 } 5789 } else { 5790 Address from_addr = Address(rsp, from->reg2stack() * VMRegImpl::stack_slot_size + wordSize); 5791 if (to->is_reg()) { 5792 if (to->is_XMMRegister()) { 5793 if (bt == T_DOUBLE) { 5794 movdbl(to->as_XMMRegister(), from_addr); 5795 } else { 5796 assert(bt == T_FLOAT, "must be float"); 5797 
movflt(to->as_XMMRegister(), from_addr); 5798 } 5799 } else { 5800 movq(to->as_Register(), from_addr); 5801 } 5802 } else { 5803 int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 5804 movq(r13, from_addr); 5805 movq(Address(rsp, st_off), r13); 5806 } 5807 } 5808 } 5809 // Update register states 5810 reg_state[from->value()] = reg_writable; 5811 reg_state[to->value()] = reg_written; 5812 return true; 5813 } 5814 5815 // Calculate the extra stack space required for packing or unpacking inline 5816 // args and adjust the stack pointer 5817 int MacroAssembler::extend_stack_for_inline_args(int args_on_stack) { 5818 // Two additional slots to account for return address 5819 int sp_inc = (args_on_stack + 2) * VMRegImpl::stack_slot_size; 5820 sp_inc = align_up(sp_inc, StackAlignmentInBytes); 5821 // Save the return address, adjust the stack (make sure it is properly 5822 // 16-byte aligned) and copy the return address to the new top of the stack. 5823 // The stack will be repaired on return (see MacroAssembler::remove_frame). 5824 assert(sp_inc > 0, "sanity"); 5825 pop(r13); 5826 subptr(rsp, sp_inc); 5827 push(r13); 5828 return sp_inc; 5829 } 5830 5831 // Read all fields from an inline type buffer and store the field values in registers/stack slots. 5832 bool MacroAssembler::unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, 5833 VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index, 5834 RegState reg_state[]) { 5835 assert(sig->at(sig_index)._bt == T_VOID, "should be at end delimiter"); 5836 assert(from->is_valid(), "source must be valid"); 5837 bool progress = false; 5838 #ifdef ASSERT 5839 const int start_offset = offset(); 5840 #endif 5841 5842 Label L_null, L_notNull; 5843 // Don't use r14 as tmp because it's used for spilling (see MacroAssembler::spill_reg_for) 5844 Register tmp1 = r10; 5845 Register tmp2 = r13; 5846 Register fromReg = noreg; 5847 ScalarizedInlineArgsStream stream(sig, sig_index, to, to_count, to_index, -1); 5848 bool done = true; 5849 bool mark_done = true; 5850 VMReg toReg; 5851 BasicType bt; 5852 // Check if argument requires a null check 5853 bool null_check = false; 5854 VMReg nullCheckReg; 5855 while (stream.next(nullCheckReg, bt)) { 5856 if (sig->at(stream.sig_index())._offset == -1) { 5857 null_check = true; 5858 break; 5859 } 5860 } 5861 stream.reset(sig_index, to_index); 5862 while (stream.next(toReg, bt)) { 5863 assert(toReg->is_valid(), "destination must be valid"); 5864 int idx = (int)toReg->value(); 5865 if (reg_state[idx] == reg_readonly) { 5866 if (idx != from->value()) { 5867 mark_done = false; 5868 } 5869 done = false; 5870 continue; 5871 } else if (reg_state[idx] == reg_written) { 5872 continue; 5873 } 5874 assert(reg_state[idx] == reg_writable, "must be writable"); 5875 reg_state[idx] = reg_written; 5876 progress = true; 5877 5878 if (fromReg == noreg) { 5879 if (from->is_reg()) { 5880 fromReg = from->as_Register(); 5881 } else { 5882 int st_off = from->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 5883 movq(tmp1, Address(rsp, st_off)); 5884 fromReg = tmp1; 5885 } 5886 if (null_check) { 5887 // Nullable inline type argument, emit null check 5888 testptr(fromReg, fromReg); 5889 jcc(Assembler::zero, L_null); 5890 } 5891 } 5892 int off = sig->at(stream.sig_index())._offset; 5893 if (off == -1) { 5894 assert(null_check, "Missing null check at"); 5895 if (toReg->is_stack()) { 5896 int st_off = toReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 5897 movq(Address(rsp, st_off), 1); 5898 } 
else { 5899 movq(toReg->as_Register(), 1); 5900 } 5901 continue; 5902 } 5903 assert(off > 0, "offset in object should be positive"); 5904 Address fromAddr = Address(fromReg, off); 5905 if (!toReg->is_XMMRegister()) { 5906 Register dst = toReg->is_stack() ? tmp2 : toReg->as_Register(); 5907 if (is_reference_type(bt)) { 5908 load_heap_oop(dst, fromAddr); 5909 } else { 5910 bool is_signed = (bt != T_CHAR) && (bt != T_BOOLEAN); 5911 load_sized_value(dst, fromAddr, type2aelembytes(bt), is_signed); 5912 } 5913 if (toReg->is_stack()) { 5914 int st_off = toReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 5915 movq(Address(rsp, st_off), dst); 5916 } 5917 } else if (bt == T_DOUBLE) { 5918 movdbl(toReg->as_XMMRegister(), fromAddr); 5919 } else { 5920 assert(bt == T_FLOAT, "must be float"); 5921 movflt(toReg->as_XMMRegister(), fromAddr); 5922 } 5923 } 5924 if (progress && null_check) { 5925 if (done) { 5926 jmp(L_notNull); 5927 bind(L_null); 5928 // Set IsInit field to zero to signal that the argument is null. 5929 // Also set all oop fields to zero to make the GC happy. 5930 stream.reset(sig_index, to_index); 5931 while (stream.next(toReg, bt)) { 5932 if (sig->at(stream.sig_index())._offset == -1 || 5933 bt == T_OBJECT || bt == T_ARRAY) { 5934 if (toReg->is_stack()) { 5935 int st_off = toReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 5936 movq(Address(rsp, st_off), 0); 5937 } else { 5938 xorq(toReg->as_Register(), toReg->as_Register()); 5939 } 5940 } 5941 } 5942 bind(L_notNull); 5943 } else { 5944 bind(L_null); 5945 } 5946 } 5947 5948 sig_index = stream.sig_index(); 5949 to_index = stream.regs_index(); 5950 5951 if (mark_done && reg_state[from->value()] != reg_written) { 5952 // This is okay because no one else will write to that slot 5953 reg_state[from->value()] = reg_writable; 5954 } 5955 from_index--; 5956 assert(progress || (start_offset == offset()), "should not emit code"); 5957 return done; 5958 } 5959 5960 bool MacroAssembler::pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index, 5961 VMRegPair* from, int from_count, int& from_index, VMReg to, 5962 RegState reg_state[], Register val_array) { 5963 assert(sig->at(sig_index)._bt == T_PRIMITIVE_OBJECT, "should be at end delimiter"); 5964 assert(to->is_valid(), "destination must be valid"); 5965 5966 if (reg_state[to->value()] == reg_written) { 5967 skip_unpacked_fields(sig, sig_index, from, from_count, from_index); 5968 return true; // Already written 5969 } 5970 5971 // TODO 8284443 Isn't it an issue if below code uses r14 as tmp when it contains a spilled value? 5972 // Be careful with r14 because it's used for spilling (see MacroAssembler::spill_reg_for). 5973 Register val_obj_tmp = r11; 5974 Register from_reg_tmp = r14; 5975 Register tmp1 = r10; 5976 Register tmp2 = r13; 5977 Register tmp3 = rbx; 5978 Register val_obj = to->is_stack() ? 
val_obj_tmp : to->as_Register(); 5979 5980 assert_different_registers(val_obj_tmp, from_reg_tmp, tmp1, tmp2, tmp3, val_array); 5981 5982 if (reg_state[to->value()] == reg_readonly) { 5983 if (!is_reg_in_unpacked_fields(sig, sig_index, to, from, from_count, from_index)) { 5984 skip_unpacked_fields(sig, sig_index, from, from_count, from_index); 5985 return false; // Not yet writable 5986 } 5987 val_obj = val_obj_tmp; 5988 } 5989 5990 int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + vtarg_index * type2aelembytes(T_PRIMITIVE_OBJECT); 5991 load_heap_oop(val_obj, Address(val_array, index)); 5992 5993 ScalarizedInlineArgsStream stream(sig, sig_index, from, from_count, from_index); 5994 VMReg fromReg; 5995 BasicType bt; 5996 Label L_null; 5997 while (stream.next(fromReg, bt)) { 5998 assert(fromReg->is_valid(), "source must be valid"); 5999 reg_state[fromReg->value()] = reg_writable; 6000 6001 int off = sig->at(stream.sig_index())._offset; 6002 if (off == -1) { 6003 // Nullable inline type argument, emit null check 6004 Label L_notNull; 6005 if (fromReg->is_stack()) { 6006 int ld_off = fromReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 6007 testb(Address(rsp, ld_off), 1); 6008 } else { 6009 testb(fromReg->as_Register(), 1); 6010 } 6011 jcc(Assembler::notZero, L_notNull); 6012 movptr(val_obj, 0); 6013 jmp(L_null); 6014 bind(L_notNull); 6015 continue; 6016 } 6017 6018 assert(off > 0, "offset in object should be positive"); 6019 size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize; 6020 6021 Address dst(val_obj, off); 6022 if (!fromReg->is_XMMRegister()) { 6023 Register src; 6024 if (fromReg->is_stack()) { 6025 src = from_reg_tmp; 6026 int ld_off = fromReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 6027 load_sized_value(src, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false); 6028 } else { 6029 src = fromReg->as_Register(); 6030 } 6031 assert_different_registers(dst.base(), src, tmp1, tmp2, tmp3, val_array); 6032 if (is_reference_type(bt)) { 6033 store_heap_oop(dst, src, tmp1, tmp2, tmp3, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED); 6034 } else { 6035 store_sized_value(dst, src, size_in_bytes); 6036 } 6037 } else if (bt == T_DOUBLE) { 6038 movdbl(dst, fromReg->as_XMMRegister()); 6039 } else { 6040 assert(bt == T_FLOAT, "must be float"); 6041 movflt(dst, fromReg->as_XMMRegister()); 6042 } 6043 } 6044 bind(L_null); 6045 sig_index = stream.sig_index(); 6046 from_index = stream.regs_index(); 6047 6048 assert(reg_state[to->value()] == reg_writable, "must have already been read"); 6049 bool success = move_helper(val_obj->as_VMReg(), to, T_OBJECT, reg_state); 6050 assert(success, "to register must be writeable"); 6051 return true; 6052 } 6053 6054 VMReg MacroAssembler::spill_reg_for(VMReg reg) { 6055 return reg->is_XMMRegister() ? xmm8->as_VMReg() : r14->as_VMReg(); 6056 } 6057 6058 void MacroAssembler::remove_frame(int initial_framesize, bool needs_stack_repair) { 6059 assert((initial_framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 6060 if (needs_stack_repair) { 6061 movq(rbp, Address(rsp, initial_framesize)); 6062 // The stack increment resides just below the saved rbp 6063 addq(rsp, Address(rsp, initial_framesize - wordSize)); 6064 } else { 6065 if (initial_framesize > 0) { 6066 addq(rsp, initial_framesize); 6067 } 6068 pop(rbp); 6069 } 6070 } 6071 6072 // Clearing constant sized memory using YMM/ZMM registers. 
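
// The constant-size path below writes full 64-byte (or 32-byte) chunks and then
// finishes a trailing, non-multiple-of-eight count of qwords with an AVX-512
// masked store: the opmask loaded via kmovwl is simply (1 << remaining) - 1,
// one bit per 8-byte lane (0x7 for 3 qwords, 0x1F for 5, 0x3F for 6, 0x7F for 7).
// A scalar C++ model of that masked store, with illustrative names only and
// <cstdint> assumed:
static inline void sketch_masked_qword_store(uint64_t* dst, uint64_t val, unsigned remaining) {
  unsigned mask = (1u << remaining) - 1;   // same value the code below loads into the k-register
  for (unsigned lane = 0; lane < 8; lane++) {
    if (mask & (1u << lane)) {
      dst[lane] = val;                     // only the enabled lanes are written
    }
  }
}
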
6073 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) { 6074 assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), ""); 6075 bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0); 6076 6077 int vector64_count = (cnt & (~0x7)) >> 3; 6078 cnt = cnt & 0x7; 6079 6080 // 64 byte initialization loop. 6081 vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit); 6082 for (int i = 0; i < vector64_count; i++) { 6083 fill64(base, i * 64, xtmp, use64byteVector); 6084 } 6085 6086 // Clear remaining 64 byte tail. 6087 int disp = vector64_count * 64; 6088 if (cnt) { 6089 switch (cnt) { 6090 case 1: 6091 movq(Address(base, disp), xtmp); 6092 break; 6093 case 2: 6094 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_128bit); 6095 break; 6096 case 3: 6097 movl(rtmp, 0x7); 6098 kmovwl(mask, rtmp); 6099 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_256bit); 6100 break; 6101 case 4: 6102 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit); 6103 break; 6104 case 5: 6105 if (use64byteVector) { 6106 movl(rtmp, 0x1F); 6107 kmovwl(mask, rtmp); 6108 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit); 6109 } else { 6110 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit); 6111 movq(Address(base, disp + 32), xtmp); 6112 } 6113 break; 6114 case 6: 6115 if (use64byteVector) { 6116 movl(rtmp, 0x3F); 6117 kmovwl(mask, rtmp); 6118 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit); 6119 } else { 6120 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit); 6121 evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, Assembler::AVX_128bit); 6122 } 6123 break; 6124 case 7: 6125 if (use64byteVector) { 6126 movl(rtmp, 0x7F); 6127 kmovwl(mask, rtmp); 6128 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit); 6129 } else { 6130 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit); 6131 movl(rtmp, 0x7); 6132 kmovwl(mask, rtmp); 6133 evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, Assembler::AVX_256bit); 6134 } 6135 break; 6136 default: 6137 fatal("Unexpected length : %d\n",cnt); 6138 break; 6139 } 6140 } 6141 } 6142 6143 void MacroAssembler::clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp, 6144 bool is_large, bool word_copy_only, KRegister mask) { 6145 // cnt - number of qwords (8-byte words). 6146 // base - start address, qword aligned. 
6147 // is_large - if optimizers know cnt is larger than InitArrayShortSize 6148 assert(base==rdi, "base register must be edi for rep stos"); 6149 assert(val==rax, "val register must be eax for rep stos"); 6150 assert(cnt==rcx, "cnt register must be ecx for rep stos"); 6151 assert(InitArrayShortSize % BytesPerLong == 0, 6152 "InitArrayShortSize should be the multiple of BytesPerLong"); 6153 6154 Label DONE; 6155 6156 if (!is_large) { 6157 Label LOOP, LONG; 6158 cmpptr(cnt, InitArrayShortSize/BytesPerLong); 6159 jccb(Assembler::greater, LONG); 6160 6161 NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM 6162 6163 decrement(cnt); 6164 jccb(Assembler::negative, DONE); // Zero length 6165 6166 // Use individual pointer-sized stores for small counts: 6167 BIND(LOOP); 6168 movptr(Address(base, cnt, Address::times_ptr), val); 6169 decrement(cnt); 6170 jccb(Assembler::greaterEqual, LOOP); 6171 jmpb(DONE); 6172 6173 BIND(LONG); 6174 } 6175 6176 // Use longer rep-prefixed ops for non-small counts: 6177 if (UseFastStosb && !word_copy_only) { 6178 shlptr(cnt, 3); // convert to number of bytes 6179 rep_stosb(); 6180 } else if (UseXMMForObjInit) { 6181 xmm_clear_mem(base, cnt, val, xtmp, mask); 6182 } else { 6183 NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM 6184 rep_stos(); 6185 } 6186 6187 BIND(DONE); 6188 } 6189 6190 #endif //COMPILER2_OR_JVMCI 6191 6192 6193 void MacroAssembler::generate_fill(BasicType t, bool aligned, 6194 Register to, Register value, Register count, 6195 Register rtmp, XMMRegister xtmp) { 6196 ShortBranchVerifier sbv(this); 6197 assert_different_registers(to, value, count, rtmp); 6198 Label L_exit; 6199 Label L_fill_2_bytes, L_fill_4_bytes; 6200 6201 #if defined(COMPILER2) && defined(_LP64) 6202 if(MaxVectorSize >=32 && 6203 VM_Version::supports_avx512vlbw() && 6204 VM_Version::supports_bmi2()) { 6205 generate_fill_avx3(t, to, value, count, rtmp, xtmp); 6206 return; 6207 } 6208 #endif 6209 6210 int shift = -1; 6211 switch (t) { 6212 case T_BYTE: 6213 shift = 2; 6214 break; 6215 case T_SHORT: 6216 shift = 1; 6217 break; 6218 case T_INT: 6219 shift = 0; 6220 break; 6221 default: ShouldNotReachHere(); 6222 } 6223 6224 if (t == T_BYTE) { 6225 andl(value, 0xff); 6226 movl(rtmp, value); 6227 shll(rtmp, 8); 6228 orl(value, rtmp); 6229 } 6230 if (t == T_SHORT) { 6231 andl(value, 0xffff); 6232 } 6233 if (t == T_BYTE || t == T_SHORT) { 6234 movl(rtmp, value); 6235 shll(rtmp, 16); 6236 orl(value, rtmp); 6237 } 6238 6239 cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element 6240 jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp 6241 if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) { 6242 Label L_skip_align2; 6243 // align source address at 4 bytes address boundary 6244 if (t == T_BYTE) { 6245 Label L_skip_align1; 6246 // One byte misalignment happens only for byte arrays 6247 testptr(to, 1); 6248 jccb(Assembler::zero, L_skip_align1); 6249 movb(Address(to, 0), value); 6250 increment(to); 6251 decrement(count); 6252 BIND(L_skip_align1); 6253 } 6254 // Two bytes misalignment happens only for byte and short (char) arrays 6255 testptr(to, 2); 6256 jccb(Assembler::zero, L_skip_align2); 6257 movw(Address(to, 0), value); 6258 addptr(to, 2); 6259 subl(count, 1<<(shift-1)); 6260 BIND(L_skip_align2); 6261 } 6262 if (UseSSE < 2) { 6263 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; 6264 // Fill 32-byte chunks 6265 subl(count, 8 << shift); 6266 
jcc(Assembler::less, L_check_fill_8_bytes); 6267 align(16); 6268 6269 BIND(L_fill_32_bytes_loop); 6270 6271 for (int i = 0; i < 32; i += 4) { 6272 movl(Address(to, i), value); 6273 } 6274 6275 addptr(to, 32); 6276 subl(count, 8 << shift); 6277 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); 6278 BIND(L_check_fill_8_bytes); 6279 addl(count, 8 << shift); 6280 jccb(Assembler::zero, L_exit); 6281 jmpb(L_fill_8_bytes); 6282 6283 // 6284 // length is too short, just fill qwords 6285 // 6286 BIND(L_fill_8_bytes_loop); 6287 movl(Address(to, 0), value); 6288 movl(Address(to, 4), value); 6289 addptr(to, 8); 6290 BIND(L_fill_8_bytes); 6291 subl(count, 1 << (shift + 1)); 6292 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); 6293 // fall through to fill 4 bytes 6294 } else { 6295 Label L_fill_32_bytes; 6296 if (!UseUnalignedLoadStores) { 6297 // align to 8 bytes, we know we are 4 byte aligned to start 6298 testptr(to, 4); 6299 jccb(Assembler::zero, L_fill_32_bytes); 6300 movl(Address(to, 0), value); 6301 addptr(to, 4); 6302 subl(count, 1<<shift); 6303 } 6304 BIND(L_fill_32_bytes); 6305 { 6306 assert( UseSSE >= 2, "supported cpu only" ); 6307 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; 6308 movdl(xtmp, value); 6309 if (UseAVX >= 2 && UseUnalignedLoadStores) { 6310 Label L_check_fill_32_bytes; 6311 if (UseAVX > 2) { 6312 // Fill 64-byte chunks 6313 Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2; 6314 6315 // If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2 6316 cmpl(count, VM_Version::avx3_threshold()); 6317 jccb(Assembler::below, L_check_fill_64_bytes_avx2); 6318 6319 vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit); 6320 6321 subl(count, 16 << shift); 6322 jccb(Assembler::less, L_check_fill_32_bytes); 6323 align(16); 6324 6325 BIND(L_fill_64_bytes_loop_avx3); 6326 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit); 6327 addptr(to, 64); 6328 subl(count, 16 << shift); 6329 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3); 6330 jmpb(L_check_fill_32_bytes); 6331 6332 BIND(L_check_fill_64_bytes_avx2); 6333 } 6334 // Fill 64-byte chunks 6335 Label L_fill_64_bytes_loop; 6336 vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit); 6337 6338 subl(count, 16 << shift); 6339 jcc(Assembler::less, L_check_fill_32_bytes); 6340 align(16); 6341 6342 BIND(L_fill_64_bytes_loop); 6343 vmovdqu(Address(to, 0), xtmp); 6344 vmovdqu(Address(to, 32), xtmp); 6345 addptr(to, 64); 6346 subl(count, 16 << shift); 6347 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop); 6348 6349 BIND(L_check_fill_32_bytes); 6350 addl(count, 8 << shift); 6351 jccb(Assembler::less, L_check_fill_8_bytes); 6352 vmovdqu(Address(to, 0), xtmp); 6353 addptr(to, 32); 6354 subl(count, 8 << shift); 6355 6356 BIND(L_check_fill_8_bytes); 6357 // clean upper bits of YMM registers 6358 movdl(xtmp, value); 6359 pshufd(xtmp, xtmp, 0); 6360 } else { 6361 // Fill 32-byte chunks 6362 pshufd(xtmp, xtmp, 0); 6363 6364 subl(count, 8 << shift); 6365 jcc(Assembler::less, L_check_fill_8_bytes); 6366 align(16); 6367 6368 BIND(L_fill_32_bytes_loop); 6369 6370 if (UseUnalignedLoadStores) { 6371 movdqu(Address(to, 0), xtmp); 6372 movdqu(Address(to, 16), xtmp); 6373 } else { 6374 movq(Address(to, 0), xtmp); 6375 movq(Address(to, 8), xtmp); 6376 movq(Address(to, 16), xtmp); 6377 movq(Address(to, 24), xtmp); 6378 } 6379 6380 addptr(to, 32); 6381 subl(count, 8 << shift); 6382 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); 6383 6384 BIND(L_check_fill_8_bytes); 6385 } 6386 
addl(count, 8 << shift); 6387 jccb(Assembler::zero, L_exit); 6388 jmpb(L_fill_8_bytes); 6389 6390 // 6391 // length is too short, just fill qwords 6392 // 6393 BIND(L_fill_8_bytes_loop); 6394 movq(Address(to, 0), xtmp); 6395 addptr(to, 8); 6396 BIND(L_fill_8_bytes); 6397 subl(count, 1 << (shift + 1)); 6398 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); 6399 } 6400 } 6401 // fill trailing 4 bytes 6402 BIND(L_fill_4_bytes); 6403 testl(count, 1<<shift); 6404 jccb(Assembler::zero, L_fill_2_bytes); 6405 movl(Address(to, 0), value); 6406 if (t == T_BYTE || t == T_SHORT) { 6407 Label L_fill_byte; 6408 addptr(to, 4); 6409 BIND(L_fill_2_bytes); 6410 // fill trailing 2 bytes 6411 testl(count, 1<<(shift-1)); 6412 jccb(Assembler::zero, L_fill_byte); 6413 movw(Address(to, 0), value); 6414 if (t == T_BYTE) { 6415 addptr(to, 2); 6416 BIND(L_fill_byte); 6417 // fill trailing byte 6418 testl(count, 1); 6419 jccb(Assembler::zero, L_exit); 6420 movb(Address(to, 0), value); 6421 } else { 6422 BIND(L_fill_byte); 6423 } 6424 } else { 6425 BIND(L_fill_2_bytes); 6426 } 6427 BIND(L_exit); 6428 } 6429 6430 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) { 6431 switch(type) { 6432 case T_BYTE: 6433 case T_BOOLEAN: 6434 evpbroadcastb(dst, src, vector_len); 6435 break; 6436 case T_SHORT: 6437 case T_CHAR: 6438 evpbroadcastw(dst, src, vector_len); 6439 break; 6440 case T_INT: 6441 case T_FLOAT: 6442 evpbroadcastd(dst, src, vector_len); 6443 break; 6444 case T_LONG: 6445 case T_DOUBLE: 6446 evpbroadcastq(dst, src, vector_len); 6447 break; 6448 default: 6449 fatal("Unhandled type : %s", type2name(type)); 6450 break; 6451 } 6452 } 6453 6454 // encode char[] to byte[] in ISO_8859_1 or ASCII 6455 //@IntrinsicCandidate 6456 //private static int implEncodeISOArray(byte[] sa, int sp, 6457 //byte[] da, int dp, int len) { 6458 // int i = 0; 6459 // for (; i < len; i++) { 6460 // char c = StringUTF16.getChar(sa, sp++); 6461 // if (c > '\u00FF') 6462 // break; 6463 // da[dp++] = (byte)c; 6464 // } 6465 // return i; 6466 //} 6467 // 6468 //@IntrinsicCandidate 6469 //private static int implEncodeAsciiArray(char[] sa, int sp, 6470 // byte[] da, int dp, int len) { 6471 // int i = 0; 6472 // for (; i < len; i++) { 6473 // char c = sa[sp++]; 6474 // if (c >= '\u0080') 6475 // break; 6476 // da[dp++] = (byte)c; 6477 // } 6478 // return i; 6479 //} 6480 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len, 6481 XMMRegister tmp1Reg, XMMRegister tmp2Reg, 6482 XMMRegister tmp3Reg, XMMRegister tmp4Reg, 6483 Register tmp5, Register result, bool ascii) { 6484 6485 // rsi: src 6486 // rdi: dst 6487 // rdx: len 6488 // rcx: tmp5 6489 // rax: result 6490 ShortBranchVerifier sbv(this); 6491 assert_different_registers(src, dst, len, tmp5, result); 6492 Label L_done, L_copy_1_char, L_copy_1_char_exit; 6493 6494 int mask = ascii ? 0xff80ff80 : 0xff00ff00; 6495 int short_mask = ascii ? 
0xff80 : 0xff00; 6496 6497 // set result 6498 xorl(result, result); 6499 // check for zero length 6500 testl(len, len); 6501 jcc(Assembler::zero, L_done); 6502 6503 movl(result, len); 6504 6505 // Setup pointers 6506 lea(src, Address(src, len, Address::times_2)); // char[] 6507 lea(dst, Address(dst, len, Address::times_1)); // byte[] 6508 negptr(len); 6509 6510 if (UseSSE42Intrinsics || UseAVX >= 2) { 6511 Label L_copy_8_chars, L_copy_8_chars_exit; 6512 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit; 6513 6514 if (UseAVX >= 2) { 6515 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit; 6516 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector 6517 movdl(tmp1Reg, tmp5); 6518 vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit); 6519 jmp(L_chars_32_check); 6520 6521 bind(L_copy_32_chars); 6522 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64)); 6523 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32)); 6524 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1); 6525 vptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector 6526 jccb(Assembler::notZero, L_copy_32_chars_exit); 6527 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1); 6528 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1); 6529 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg); 6530 6531 bind(L_chars_32_check); 6532 addptr(len, 32); 6533 jcc(Assembler::lessEqual, L_copy_32_chars); 6534 6535 bind(L_copy_32_chars_exit); 6536 subptr(len, 16); 6537 jccb(Assembler::greater, L_copy_16_chars_exit); 6538 6539 } else if (UseSSE42Intrinsics) { 6540 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector 6541 movdl(tmp1Reg, tmp5); 6542 pshufd(tmp1Reg, tmp1Reg, 0); 6543 jmpb(L_chars_16_check); 6544 } 6545 6546 bind(L_copy_16_chars); 6547 if (UseAVX >= 2) { 6548 vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32)); 6549 vptest(tmp2Reg, tmp1Reg); 6550 jcc(Assembler::notZero, L_copy_16_chars_exit); 6551 vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1); 6552 vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1); 6553 } else { 6554 if (UseAVX > 0) { 6555 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32)); 6556 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16)); 6557 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0); 6558 } else { 6559 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32)); 6560 por(tmp2Reg, tmp3Reg); 6561 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16)); 6562 por(tmp2Reg, tmp4Reg); 6563 } 6564 ptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector 6565 jccb(Assembler::notZero, L_copy_16_chars_exit); 6566 packuswb(tmp3Reg, tmp4Reg); 6567 } 6568 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg); 6569 6570 bind(L_chars_16_check); 6571 addptr(len, 16); 6572 jcc(Assembler::lessEqual, L_copy_16_chars); 6573 6574 bind(L_copy_16_chars_exit); 6575 if (UseAVX >= 2) { 6576 // clean upper bits of YMM registers 6577 vpxor(tmp2Reg, tmp2Reg); 6578 vpxor(tmp3Reg, tmp3Reg); 6579 vpxor(tmp4Reg, tmp4Reg); 6580 movdl(tmp1Reg, tmp5); 6581 pshufd(tmp1Reg, tmp1Reg, 0); 6582 } 6583 subptr(len, 8); 6584 jccb(Assembler::greater, L_copy_8_chars_exit); 6585 6586 bind(L_copy_8_chars); 6587 movdqu(tmp3Reg, Address(src, len, Address::times_2, -16)); 6588 ptest(tmp3Reg, tmp1Reg); 6589 jccb(Assembler::notZero, L_copy_8_chars_exit); 6590 packuswb(tmp3Reg, tmp1Reg); 6591 movq(Address(dst, len, Address::times_1, -8), tmp3Reg); 6592 addptr(len, 8); 6593 
jccb(Assembler::lessEqual, L_copy_8_chars); 6594 6595 bind(L_copy_8_chars_exit); 6596 subptr(len, 8); 6597 jccb(Assembler::zero, L_done); 6598 } 6599 6600 bind(L_copy_1_char); 6601 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0)); 6602 testl(tmp5, short_mask); // check if Unicode or non-ASCII char 6603 jccb(Assembler::notZero, L_copy_1_char_exit); 6604 movb(Address(dst, len, Address::times_1, 0), tmp5); 6605 addptr(len, 1); 6606 jccb(Assembler::less, L_copy_1_char); 6607 6608 bind(L_copy_1_char_exit); 6609 addptr(result, len); // len is negative count of not processed elements 6610 6611 bind(L_done); 6612 } 6613 6614 #ifdef _LP64 6615 /** 6616 * Helper for multiply_to_len(). 6617 */ 6618 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) { 6619 addq(dest_lo, src1); 6620 adcq(dest_hi, 0); 6621 addq(dest_lo, src2); 6622 adcq(dest_hi, 0); 6623 } 6624 6625 /** 6626 * Multiply 64 bit by 64 bit first loop. 6627 */ 6628 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart, 6629 Register y, Register y_idx, Register z, 6630 Register carry, Register product, 6631 Register idx, Register kdx) { 6632 // 6633 // jlong carry, x[], y[], z[]; 6634 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 6635 // huge_128 product = y[idx] * x[xstart] + carry; 6636 // z[kdx] = (jlong)product; 6637 // carry = (jlong)(product >>> 64); 6638 // } 6639 // z[xstart] = carry; 6640 // 6641 6642 Label L_first_loop, L_first_loop_exit; 6643 Label L_one_x, L_one_y, L_multiply; 6644 6645 decrementl(xstart); 6646 jcc(Assembler::negative, L_one_x); 6647 6648 movq(x_xstart, Address(x, xstart, Address::times_4, 0)); 6649 rorq(x_xstart, 32); // convert big-endian to little-endian 6650 6651 bind(L_first_loop); 6652 decrementl(idx); 6653 jcc(Assembler::negative, L_first_loop_exit); 6654 decrementl(idx); 6655 jcc(Assembler::negative, L_one_y); 6656 movq(y_idx, Address(y, idx, Address::times_4, 0)); 6657 rorq(y_idx, 32); // convert big-endian to little-endian 6658 bind(L_multiply); 6659 movq(product, x_xstart); 6660 mulq(y_idx); // product(rax) * y_idx -> rdx:rax 6661 addq(product, carry); 6662 adcq(rdx, 0); 6663 subl(kdx, 2); 6664 movl(Address(z, kdx, Address::times_4, 4), product); 6665 shrq(product, 32); 6666 movl(Address(z, kdx, Address::times_4, 0), product); 6667 movq(carry, rdx); 6668 jmp(L_first_loop); 6669 6670 bind(L_one_y); 6671 movl(y_idx, Address(y, 0)); 6672 jmp(L_multiply); 6673 6674 bind(L_one_x); 6675 movl(x_xstart, Address(x, 0)); 6676 jmp(L_first_loop); 6677 6678 bind(L_first_loop_exit); 6679 } 6680 6681 /** 6682 * Multiply 64 bit by 64 bit and add 128 bit. 
6683 */ 6684 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z, 6685 Register yz_idx, Register idx, 6686 Register carry, Register product, int offset) { 6687 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 6688 // z[kdx] = (jlong)product; 6689 6690 movq(yz_idx, Address(y, idx, Address::times_4, offset)); 6691 rorq(yz_idx, 32); // convert big-endian to little-endian 6692 movq(product, x_xstart); 6693 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) 6694 movq(yz_idx, Address(z, idx, Address::times_4, offset)); 6695 rorq(yz_idx, 32); // convert big-endian to little-endian 6696 6697 add2_with_carry(rdx, product, carry, yz_idx); 6698 6699 movl(Address(z, idx, Address::times_4, offset+4), product); 6700 shrq(product, 32); 6701 movl(Address(z, idx, Address::times_4, offset), product); 6702 6703 } 6704 6705 /** 6706 * Multiply 128 bit by 128 bit. Unrolled inner loop. 6707 */ 6708 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z, 6709 Register yz_idx, Register idx, Register jdx, 6710 Register carry, Register product, 6711 Register carry2) { 6712 // jlong carry, x[], y[], z[]; 6713 // int kdx = ystart+1; 6714 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 6715 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 6716 // z[kdx+idx+1] = (jlong)product; 6717 // jlong carry2 = (jlong)(product >>> 64); 6718 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 6719 // z[kdx+idx] = (jlong)product; 6720 // carry = (jlong)(product >>> 64); 6721 // } 6722 // idx += 2; 6723 // if (idx > 0) { 6724 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 6725 // z[kdx+idx] = (jlong)product; 6726 // carry = (jlong)(product >>> 64); 6727 // } 6728 // 6729 6730 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 6731 6732 movl(jdx, idx); 6733 andl(jdx, 0xFFFFFFFC); 6734 shrl(jdx, 2); 6735 6736 bind(L_third_loop); 6737 subl(jdx, 1); 6738 jcc(Assembler::negative, L_third_loop_exit); 6739 subl(idx, 4); 6740 6741 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8); 6742 movq(carry2, rdx); 6743 6744 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0); 6745 movq(carry, rdx); 6746 jmp(L_third_loop); 6747 6748 bind (L_third_loop_exit); 6749 6750 andl (idx, 0x3); 6751 jcc(Assembler::zero, L_post_third_loop_done); 6752 6753 Label L_check_1; 6754 subl(idx, 2); 6755 jcc(Assembler::negative, L_check_1); 6756 6757 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0); 6758 movq(carry, rdx); 6759 6760 bind (L_check_1); 6761 addl (idx, 0x2); 6762 andl (idx, 0x1); 6763 subl(idx, 1); 6764 jcc(Assembler::negative, L_post_third_loop_done); 6765 6766 movl(yz_idx, Address(y, idx, Address::times_4, 0)); 6767 movq(product, x_xstart); 6768 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) 6769 movl(yz_idx, Address(z, idx, Address::times_4, 0)); 6770 6771 add2_with_carry(rdx, product, yz_idx, carry); 6772 6773 movl(Address(z, idx, Address::times_4, 0), product); 6774 shrq(product, 32); 6775 6776 shlq(rdx, 32); 6777 orq(product, rdx); 6778 movq(carry, product); 6779 6780 bind(L_post_third_loop_done); 6781 } 6782 6783 /** 6784 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop. 
6785 * 6786 */ 6787 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z, 6788 Register carry, Register carry2, 6789 Register idx, Register jdx, 6790 Register yz_idx1, Register yz_idx2, 6791 Register tmp, Register tmp3, Register tmp4) { 6792 assert(UseBMI2Instructions, "should be used only when BMI2 is available"); 6793 6794 // jlong carry, x[], y[], z[]; 6795 // int kdx = ystart+1; 6796 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 6797 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry; 6798 // jlong carry2 = (jlong)(tmp3 >>> 64); 6799 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2; 6800 // carry = (jlong)(tmp4 >>> 64); 6801 // z[kdx+idx+1] = (jlong)tmp3; 6802 // z[kdx+idx] = (jlong)tmp4; 6803 // } 6804 // idx += 2; 6805 // if (idx > 0) { 6806 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry; 6807 // z[kdx+idx] = (jlong)yz_idx1; 6808 // carry = (jlong)(yz_idx1 >>> 64); 6809 // } 6810 // 6811 6812 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 6813 6814 movl(jdx, idx); 6815 andl(jdx, 0xFFFFFFFC); 6816 shrl(jdx, 2); 6817 6818 bind(L_third_loop); 6819 subl(jdx, 1); 6820 jcc(Assembler::negative, L_third_loop_exit); 6821 subl(idx, 4); 6822 6823 movq(yz_idx1, Address(y, idx, Address::times_4, 8)); 6824 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian 6825 movq(yz_idx2, Address(y, idx, Address::times_4, 0)); 6826 rorxq(yz_idx2, yz_idx2, 32); 6827 6828 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 6829 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp 6830 6831 movq(yz_idx1, Address(z, idx, Address::times_4, 8)); 6832 rorxq(yz_idx1, yz_idx1, 32); 6833 movq(yz_idx2, Address(z, idx, Address::times_4, 0)); 6834 rorxq(yz_idx2, yz_idx2, 32); 6835 6836 if (VM_Version::supports_adx()) { 6837 adcxq(tmp3, carry); 6838 adoxq(tmp3, yz_idx1); 6839 6840 adcxq(tmp4, tmp); 6841 adoxq(tmp4, yz_idx2); 6842 6843 movl(carry, 0); // does not affect flags 6844 adcxq(carry2, carry); 6845 adoxq(carry2, carry); 6846 } else { 6847 add2_with_carry(tmp4, tmp3, carry, yz_idx1); 6848 add2_with_carry(carry2, tmp4, tmp, yz_idx2); 6849 } 6850 movq(carry, carry2); 6851 6852 movl(Address(z, idx, Address::times_4, 12), tmp3); 6853 shrq(tmp3, 32); 6854 movl(Address(z, idx, Address::times_4, 8), tmp3); 6855 6856 movl(Address(z, idx, Address::times_4, 4), tmp4); 6857 shrq(tmp4, 32); 6858 movl(Address(z, idx, Address::times_4, 0), tmp4); 6859 6860 jmp(L_third_loop); 6861 6862 bind (L_third_loop_exit); 6863 6864 andl (idx, 0x3); 6865 jcc(Assembler::zero, L_post_third_loop_done); 6866 6867 Label L_check_1; 6868 subl(idx, 2); 6869 jcc(Assembler::negative, L_check_1); 6870 6871 movq(yz_idx1, Address(y, idx, Address::times_4, 0)); 6872 rorxq(yz_idx1, yz_idx1, 32); 6873 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 6874 movq(yz_idx2, Address(z, idx, Address::times_4, 0)); 6875 rorxq(yz_idx2, yz_idx2, 32); 6876 6877 add2_with_carry(tmp4, tmp3, carry, yz_idx2); 6878 6879 movl(Address(z, idx, Address::times_4, 4), tmp3); 6880 shrq(tmp3, 32); 6881 movl(Address(z, idx, Address::times_4, 0), tmp3); 6882 movq(carry, tmp4); 6883 6884 bind (L_check_1); 6885 addl (idx, 0x2); 6886 andl (idx, 0x1); 6887 subl(idx, 1); 6888 jcc(Assembler::negative, L_post_third_loop_done); 6889 movl(tmp4, Address(y, idx, Address::times_4, 0)); 6890 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3 6891 movl(tmp4, Address(z, idx, Address::times_4, 0)); 6892 6893 add2_with_carry(carry2, tmp3, tmp4, carry); 6894 6895 movl(Address(z, idx, 
Address::times_4, 0), tmp3); 6896 shrq(tmp3, 32); 6897 6898 shlq(carry2, 32); 6899 orq(tmp3, carry2); 6900 movq(carry, tmp3); 6901 6902 bind(L_post_third_loop_done); 6903 } 6904 6905 /** 6906 * Code for BigInteger::multiplyToLen() instrinsic. 6907 * 6908 * rdi: x 6909 * rax: xlen 6910 * rsi: y 6911 * rcx: ylen 6912 * r8: z 6913 * r11: zlen 6914 * r12: tmp1 6915 * r13: tmp2 6916 * r14: tmp3 6917 * r15: tmp4 6918 * rbx: tmp5 6919 * 6920 */ 6921 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen, 6922 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 6923 ShortBranchVerifier sbv(this); 6924 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx); 6925 6926 push(tmp1); 6927 push(tmp2); 6928 push(tmp3); 6929 push(tmp4); 6930 push(tmp5); 6931 6932 push(xlen); 6933 push(zlen); 6934 6935 const Register idx = tmp1; 6936 const Register kdx = tmp2; 6937 const Register xstart = tmp3; 6938 6939 const Register y_idx = tmp4; 6940 const Register carry = tmp5; 6941 const Register product = xlen; 6942 const Register x_xstart = zlen; // reuse register 6943 6944 // First Loop. 6945 // 6946 // final static long LONG_MASK = 0xffffffffL; 6947 // int xstart = xlen - 1; 6948 // int ystart = ylen - 1; 6949 // long carry = 0; 6950 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 6951 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 6952 // z[kdx] = (int)product; 6953 // carry = product >>> 32; 6954 // } 6955 // z[xstart] = (int)carry; 6956 // 6957 6958 movl(idx, ylen); // idx = ylen; 6959 movl(kdx, zlen); // kdx = xlen+ylen; 6960 xorq(carry, carry); // carry = 0; 6961 6962 Label L_done; 6963 6964 movl(xstart, xlen); 6965 decrementl(xstart); 6966 jcc(Assembler::negative, L_done); 6967 6968 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx); 6969 6970 Label L_second_loop; 6971 testl(kdx, kdx); 6972 jcc(Assembler::zero, L_second_loop); 6973 6974 Label L_carry; 6975 subl(kdx, 1); 6976 jcc(Assembler::zero, L_carry); 6977 6978 movl(Address(z, kdx, Address::times_4, 0), carry); 6979 shrq(carry, 32); 6980 subl(kdx, 1); 6981 6982 bind(L_carry); 6983 movl(Address(z, kdx, Address::times_4, 0), carry); 6984 6985 // Second and third (nested) loops. 
6986 // 6987 // for (int i = xstart-1; i >= 0; i--) { // Second loop 6988 // carry = 0; 6989 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 6990 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 6991 // (z[k] & LONG_MASK) + carry; 6992 // z[k] = (int)product; 6993 // carry = product >>> 32; 6994 // } 6995 // z[i] = (int)carry; 6996 // } 6997 // 6998 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 6999 7000 const Register jdx = tmp1; 7001 7002 bind(L_second_loop); 7003 xorl(carry, carry); // carry = 0; 7004 movl(jdx, ylen); // j = ystart+1 7005 7006 subl(xstart, 1); // i = xstart-1; 7007 jcc(Assembler::negative, L_done); 7008 7009 push (z); 7010 7011 Label L_last_x; 7012 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j 7013 subl(xstart, 1); // i = xstart-1; 7014 jcc(Assembler::negative, L_last_x); 7015 7016 if (UseBMI2Instructions) { 7017 movq(rdx, Address(x, xstart, Address::times_4, 0)); 7018 rorxq(rdx, rdx, 32); // convert big-endian to little-endian 7019 } else { 7020 movq(x_xstart, Address(x, xstart, Address::times_4, 0)); 7021 rorq(x_xstart, 32); // convert big-endian to little-endian 7022 } 7023 7024 Label L_third_loop_prologue; 7025 bind(L_third_loop_prologue); 7026 7027 push (x); 7028 push (xstart); 7029 push (ylen); 7030 7031 7032 if (UseBMI2Instructions) { 7033 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4); 7034 } else { // !UseBMI2Instructions 7035 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x); 7036 } 7037 7038 pop(ylen); 7039 pop(xlen); 7040 pop(x); 7041 pop(z); 7042 7043 movl(tmp3, xlen); 7044 addl(tmp3, 1); 7045 movl(Address(z, tmp3, Address::times_4, 0), carry); 7046 subl(tmp3, 1); 7047 jccb(Assembler::negative, L_done); 7048 7049 shrq(carry, 32); 7050 movl(Address(z, tmp3, Address::times_4, 0), carry); 7051 jmp(L_second_loop); 7052 7053 // Next infrequent code is moved outside loops. 7054 bind(L_last_x); 7055 if (UseBMI2Instructions) { 7056 movl(rdx, Address(x, 0)); 7057 } else { 7058 movl(x_xstart, Address(x, 0)); 7059 } 7060 jmp(L_third_loop_prologue); 7061 7062 bind(L_done); 7063 7064 pop(zlen); 7065 pop(xlen); 7066 7067 pop(tmp5); 7068 pop(tmp4); 7069 pop(tmp3); 7070 pop(tmp2); 7071 pop(tmp1); 7072 } 7073 7074 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale, 7075 Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){ 7076 assert(UseSSE42Intrinsics, "SSE4.2 must be enabled."); 7077 Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP; 7078 Label VECTOR8_TAIL, VECTOR4_TAIL; 7079 Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL; 7080 Label SAME_TILL_END, DONE; 7081 Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL; 7082 7083 //scale is in rcx in both Win64 and Unix 7084 ShortBranchVerifier sbv(this); 7085 7086 shlq(length); 7087 xorq(result, result); 7088 7089 if ((AVX3Threshold == 0) && (UseAVX > 2) && 7090 VM_Version::supports_avx512vlbw()) { 7091 Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL; 7092 7093 cmpq(length, 64); 7094 jcc(Assembler::less, VECTOR32_TAIL); 7095 7096 movq(tmp1, length); 7097 andq(tmp1, 0x3F); // tail count 7098 andq(length, ~(0x3F)); //vector count 7099 7100 bind(VECTOR64_LOOP); 7101 // AVX512 code to compare 64 byte vectors. 
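// Illustrative only: one iteration of this loop is roughly equivalent to the
// scalar check below (hypothetical sketch; 'result' is the current byte offset):
//
//   if (memcmp(obja + result, objb + result, 64) != 0) goto VECTOR64_NOT_EQUAL;
//   result += 64;
//   length -= 64;
//
// evpcmpeqb/kortestql implement the comparison: the mask is all ones (carry
// set) only when every byte matched, so a clear carry flag signals a mismatch.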
7102 evmovdqub(rymm0, Address(obja, result), false, Assembler::AVX_512bit); 7103 evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit); 7104 kortestql(k7, k7); 7105 jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch 7106 addq(result, 64); 7107 subq(length, 64); 7108 jccb(Assembler::notZero, VECTOR64_LOOP); 7109 7110 //bind(VECTOR64_TAIL); 7111 testq(tmp1, tmp1); 7112 jcc(Assembler::zero, SAME_TILL_END); 7113 7114 //bind(VECTOR64_TAIL); 7115 // AVX512 code to compare upto 63 byte vectors. 7116 mov64(tmp2, 0xFFFFFFFFFFFFFFFF); 7117 shlxq(tmp2, tmp2, tmp1); 7118 notq(tmp2); 7119 kmovql(k3, tmp2); 7120 7121 evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit); 7122 evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit); 7123 7124 ktestql(k7, k3); 7125 jcc(Assembler::below, SAME_TILL_END); // not mismatch 7126 7127 bind(VECTOR64_NOT_EQUAL); 7128 kmovql(tmp1, k7); 7129 notq(tmp1); 7130 tzcntq(tmp1, tmp1); 7131 addq(result, tmp1); 7132 shrq(result); 7133 jmp(DONE); 7134 bind(VECTOR32_TAIL); 7135 } 7136 7137 cmpq(length, 8); 7138 jcc(Assembler::equal, VECTOR8_LOOP); 7139 jcc(Assembler::less, VECTOR4_TAIL); 7140 7141 if (UseAVX >= 2) { 7142 Label VECTOR16_TAIL, VECTOR32_LOOP; 7143 7144 cmpq(length, 16); 7145 jcc(Assembler::equal, VECTOR16_LOOP); 7146 jcc(Assembler::less, VECTOR8_LOOP); 7147 7148 cmpq(length, 32); 7149 jccb(Assembler::less, VECTOR16_TAIL); 7150 7151 subq(length, 32); 7152 bind(VECTOR32_LOOP); 7153 vmovdqu(rymm0, Address(obja, result)); 7154 vmovdqu(rymm1, Address(objb, result)); 7155 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit); 7156 vptest(rymm2, rymm2); 7157 jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found 7158 addq(result, 32); 7159 subq(length, 32); 7160 jcc(Assembler::greaterEqual, VECTOR32_LOOP); 7161 addq(length, 32); 7162 jcc(Assembler::equal, SAME_TILL_END); 7163 //falling through if less than 32 bytes left //close the branch here. 
7164 7165 bind(VECTOR16_TAIL); 7166 cmpq(length, 16); 7167 jccb(Assembler::less, VECTOR8_TAIL); 7168 bind(VECTOR16_LOOP); 7169 movdqu(rymm0, Address(obja, result)); 7170 movdqu(rymm1, Address(objb, result)); 7171 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit); 7172 ptest(rymm2, rymm2); 7173 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found 7174 addq(result, 16); 7175 subq(length, 16); 7176 jcc(Assembler::equal, SAME_TILL_END); 7177 //falling through if less than 16 bytes left 7178 } else {//regular intrinsics 7179 7180 cmpq(length, 16); 7181 jccb(Assembler::less, VECTOR8_TAIL); 7182 7183 subq(length, 16); 7184 bind(VECTOR16_LOOP); 7185 movdqu(rymm0, Address(obja, result)); 7186 movdqu(rymm1, Address(objb, result)); 7187 pxor(rymm0, rymm1); 7188 ptest(rymm0, rymm0); 7189 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found 7190 addq(result, 16); 7191 subq(length, 16); 7192 jccb(Assembler::greaterEqual, VECTOR16_LOOP); 7193 addq(length, 16); 7194 jcc(Assembler::equal, SAME_TILL_END); 7195 //falling through if less than 16 bytes left 7196 } 7197 7198 bind(VECTOR8_TAIL); 7199 cmpq(length, 8); 7200 jccb(Assembler::less, VECTOR4_TAIL); 7201 bind(VECTOR8_LOOP); 7202 movq(tmp1, Address(obja, result)); 7203 movq(tmp2, Address(objb, result)); 7204 xorq(tmp1, tmp2); 7205 testq(tmp1, tmp1); 7206 jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found 7207 addq(result, 8); 7208 subq(length, 8); 7209 jcc(Assembler::equal, SAME_TILL_END); 7210 //falling through if less than 8 bytes left 7211 7212 bind(VECTOR4_TAIL); 7213 cmpq(length, 4); 7214 jccb(Assembler::less, BYTES_TAIL); 7215 bind(VECTOR4_LOOP); 7216 movl(tmp1, Address(obja, result)); 7217 xorl(tmp1, Address(objb, result)); 7218 testl(tmp1, tmp1); 7219 jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found 7220 addq(result, 4); 7221 subq(length, 4); 7222 jcc(Assembler::equal, SAME_TILL_END); 7223 //falling through if less than 4 bytes left 7224 7225 bind(BYTES_TAIL); 7226 bind(BYTES_LOOP); 7227 load_unsigned_byte(tmp1, Address(obja, result)); 7228 load_unsigned_byte(tmp2, Address(objb, result)); 7229 xorl(tmp1, tmp2); 7230 testl(tmp1, tmp1); 7231 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 7232 decq(length); 7233 jcc(Assembler::zero, SAME_TILL_END); 7234 incq(result); 7235 load_unsigned_byte(tmp1, Address(obja, result)); 7236 load_unsigned_byte(tmp2, Address(objb, result)); 7237 xorl(tmp1, tmp2); 7238 testl(tmp1, tmp1); 7239 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 7240 decq(length); 7241 jcc(Assembler::zero, SAME_TILL_END); 7242 incq(result); 7243 load_unsigned_byte(tmp1, Address(obja, result)); 7244 load_unsigned_byte(tmp2, Address(objb, result)); 7245 xorl(tmp1, tmp2); 7246 testl(tmp1, tmp1); 7247 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 7248 jmp(SAME_TILL_END); 7249 7250 if (UseAVX >= 2) { 7251 bind(VECTOR32_NOT_EQUAL); 7252 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit); 7253 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit); 7254 vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit); 7255 vpmovmskb(tmp1, rymm0); 7256 bsfq(tmp1, tmp1); 7257 addq(result, tmp1); 7258 shrq(result); 7259 jmp(DONE); 7260 } 7261 7262 bind(VECTOR16_NOT_EQUAL); 7263 if (UseAVX >= 2) { 7264 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit); 7265 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit); 7266 pxor(rymm0, rymm2); 7267 } else { 7268 pcmpeqb(rymm2, rymm2); 7269 pxor(rymm0, rymm1); 7270 pcmpeqb(rymm0, rymm1); 7271 pxor(rymm0, rymm2); 7272 } 7273 pmovmskb(tmp1, rymm0); 7274 
bsfq(tmp1, tmp1); 7275 addq(result, tmp1); 7276 shrq(result); 7277 jmpb(DONE); 7278 7279 bind(VECTOR8_NOT_EQUAL); 7280 bind(VECTOR4_NOT_EQUAL); 7281 bsfq(tmp1, tmp1); 7282 shrq(tmp1, 3); 7283 addq(result, tmp1); 7284 bind(BYTES_NOT_EQUAL); 7285 shrq(result); 7286 jmpb(DONE); 7287 7288 bind(SAME_TILL_END); 7289 mov64(result, -1); 7290 7291 bind(DONE); 7292 } 7293 7294 //Helper functions for square_to_len() 7295 7296 /** 7297 * Store the squares of x[], right shifted one bit (divided by 2) into z[] 7298 * Preserves x and z and modifies rest of the registers. 7299 */ 7300 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 7301 // Perform square and right shift by 1 7302 // Handle odd xlen case first, then for even xlen do the following 7303 // jlong carry = 0; 7304 // for (int j=0, i=0; j < xlen; j+=2, i+=4) { 7305 // huge_128 product = x[j:j+1] * x[j:j+1]; 7306 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65); 7307 // z[i+2:i+3] = (jlong)(product >>> 1); 7308 // carry = (jlong)product; 7309 // } 7310 7311 xorq(tmp5, tmp5); // carry 7312 xorq(rdxReg, rdxReg); 7313 xorl(tmp1, tmp1); // index for x 7314 xorl(tmp4, tmp4); // index for z 7315 7316 Label L_first_loop, L_first_loop_exit; 7317 7318 testl(xlen, 1); 7319 jccb(Assembler::zero, L_first_loop); //jump if xlen is even 7320 7321 // Square and right shift by 1 the odd element using 32 bit multiply 7322 movl(raxReg, Address(x, tmp1, Address::times_4, 0)); 7323 imulq(raxReg, raxReg); 7324 shrq(raxReg, 1); 7325 adcq(tmp5, 0); 7326 movq(Address(z, tmp4, Address::times_4, 0), raxReg); 7327 incrementl(tmp1); 7328 addl(tmp4, 2); 7329 7330 // Square and right shift by 1 the rest using 64 bit multiply 7331 bind(L_first_loop); 7332 cmpptr(tmp1, xlen); 7333 jccb(Assembler::equal, L_first_loop_exit); 7334 7335 // Square 7336 movq(raxReg, Address(x, tmp1, Address::times_4, 0)); 7337 rorq(raxReg, 32); // convert big-endian to little-endian 7338 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax 7339 7340 // Right shift by 1 and save carry 7341 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1 7342 rcrq(rdxReg, 1); 7343 rcrq(raxReg, 1); 7344 adcq(tmp5, 0); 7345 7346 // Store result in z 7347 movq(Address(z, tmp4, Address::times_4, 0), rdxReg); 7348 movq(Address(z, tmp4, Address::times_4, 8), raxReg); 7349 7350 // Update indices for x and z 7351 addl(tmp1, 2); 7352 addl(tmp4, 4); 7353 jmp(L_first_loop); 7354 7355 bind(L_first_loop_exit); 7356 } 7357 7358 7359 /** 7360 * Perform the following multiply add operation using BMI2 instructions 7361 * carry:sum = sum + op1*op2 + carry 7362 * op2 should be in rdx 7363 * op2 is preserved, all other registers are modified 7364 */ 7365 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) { 7366 // assert op2 is rdx 7367 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1 7368 addq(sum, carry); 7369 adcq(tmp2, 0); 7370 addq(sum, op1); 7371 adcq(tmp2, 0); 7372 movq(carry, tmp2); 7373 } 7374 7375 /** 7376 * Perform the following multiply add operation: 7377 * carry:sum = sum + op1*op2 + carry 7378 * Preserves op1, op2 and modifies rest of registers 7379 */ 7380 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) { 7381 // rdx:rax = op1 * op2 7382 movq(raxReg, op2); 7383 mulq(op1); 7384 7385 // rdx:rax = sum + carry + rdx:rax 7386 addq(sum, carry); 7387 
adcq(rdxReg, 0); 7388 addq(sum, raxReg); 7389 adcq(rdxReg, 0); 7390 7391 // carry:sum = rdx:sum 7392 movq(carry, rdxReg); 7393 } 7394 7395 /** 7396 * Add 64 bit long carry into z[] with carry propogation. 7397 * Preserves z and carry register values and modifies rest of registers. 7398 * 7399 */ 7400 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) { 7401 Label L_fourth_loop, L_fourth_loop_exit; 7402 7403 movl(tmp1, 1); 7404 subl(zlen, 2); 7405 addq(Address(z, zlen, Address::times_4, 0), carry); 7406 7407 bind(L_fourth_loop); 7408 jccb(Assembler::carryClear, L_fourth_loop_exit); 7409 subl(zlen, 2); 7410 jccb(Assembler::negative, L_fourth_loop_exit); 7411 addq(Address(z, zlen, Address::times_4, 0), tmp1); 7412 jmp(L_fourth_loop); 7413 bind(L_fourth_loop_exit); 7414 } 7415 7416 /** 7417 * Shift z[] left by 1 bit. 7418 * Preserves x, len, z and zlen registers and modifies rest of the registers. 7419 * 7420 */ 7421 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) { 7422 7423 Label L_fifth_loop, L_fifth_loop_exit; 7424 7425 // Fifth loop 7426 // Perform primitiveLeftShift(z, zlen, 1) 7427 7428 const Register prev_carry = tmp1; 7429 const Register new_carry = tmp4; 7430 const Register value = tmp2; 7431 const Register zidx = tmp3; 7432 7433 // int zidx, carry; 7434 // long value; 7435 // carry = 0; 7436 // for (zidx = zlen-2; zidx >=0; zidx -= 2) { 7437 // (carry:value) = (z[i] << 1) | carry ; 7438 // z[i] = value; 7439 // } 7440 7441 movl(zidx, zlen); 7442 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register 7443 7444 bind(L_fifth_loop); 7445 decl(zidx); // Use decl to preserve carry flag 7446 decl(zidx); 7447 jccb(Assembler::negative, L_fifth_loop_exit); 7448 7449 if (UseBMI2Instructions) { 7450 movq(value, Address(z, zidx, Address::times_4, 0)); 7451 rclq(value, 1); 7452 rorxq(value, value, 32); 7453 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form 7454 } 7455 else { 7456 // clear new_carry 7457 xorl(new_carry, new_carry); 7458 7459 // Shift z[i] by 1, or in previous carry and save new carry 7460 movq(value, Address(z, zidx, Address::times_4, 0)); 7461 shlq(value, 1); 7462 adcl(new_carry, 0); 7463 7464 orq(value, prev_carry); 7465 rorq(value, 0x20); 7466 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form 7467 7468 // Set previous carry = new carry 7469 movl(prev_carry, new_carry); 7470 } 7471 jmp(L_fifth_loop); 7472 7473 bind(L_fifth_loop_exit); 7474 } 7475 7476 7477 /** 7478 * Code for BigInteger::squareToLen() intrinsic 7479 * 7480 * rdi: x 7481 * rsi: len 7482 * r8: z 7483 * rcx: zlen 7484 * r12: tmp1 7485 * r13: tmp2 7486 * r14: tmp3 7487 * r15: tmp4 7488 * rbx: tmp5 7489 * 7490 */ 7491 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 7492 7493 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply; 7494 push(tmp1); 7495 push(tmp2); 7496 push(tmp3); 7497 push(tmp4); 7498 push(tmp5); 7499 7500 // First loop 7501 // Store the squares, right shifted one bit (i.e., divided by 2). 7502 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg); 7503 7504 // Add in off-diagonal sums. 7505 // 7506 // Second, third (nested) and fourth loops. 
7507 // zlen +=2; 7508 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) { 7509 // carry = 0; 7510 // long op2 = x[xidx:xidx+1]; 7511 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) { 7512 // k -= 2; 7513 // long op1 = x[j:j+1]; 7514 // long sum = z[k:k+1]; 7515 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs); 7516 // z[k:k+1] = sum; 7517 // } 7518 // add_one_64(z, k, carry, tmp_regs); 7519 // } 7520 7521 const Register carry = tmp5; 7522 const Register sum = tmp3; 7523 const Register op1 = tmp4; 7524 Register op2 = tmp2; 7525 7526 push(zlen); 7527 push(len); 7528 addl(zlen,2); 7529 bind(L_second_loop); 7530 xorq(carry, carry); 7531 subl(zlen, 4); 7532 subl(len, 2); 7533 push(zlen); 7534 push(len); 7535 cmpl(len, 0); 7536 jccb(Assembler::lessEqual, L_second_loop_exit); 7537 7538 // Multiply an array by one 64 bit long. 7539 if (UseBMI2Instructions) { 7540 op2 = rdxReg; 7541 movq(op2, Address(x, len, Address::times_4, 0)); 7542 rorxq(op2, op2, 32); 7543 } 7544 else { 7545 movq(op2, Address(x, len, Address::times_4, 0)); 7546 rorq(op2, 32); 7547 } 7548 7549 bind(L_third_loop); 7550 decrementl(len); 7551 jccb(Assembler::negative, L_third_loop_exit); 7552 decrementl(len); 7553 jccb(Assembler::negative, L_last_x); 7554 7555 movq(op1, Address(x, len, Address::times_4, 0)); 7556 rorq(op1, 32); 7557 7558 bind(L_multiply); 7559 subl(zlen, 2); 7560 movq(sum, Address(z, zlen, Address::times_4, 0)); 7561 7562 // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry. 7563 if (UseBMI2Instructions) { 7564 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2); 7565 } 7566 else { 7567 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 7568 } 7569 7570 movq(Address(z, zlen, Address::times_4, 0), sum); 7571 7572 jmp(L_third_loop); 7573 bind(L_third_loop_exit); 7574 7575 // Fourth loop 7576 // Add 64 bit long carry into z with carry propogation. 7577 // Uses offsetted zlen. 7578 add_one_64(z, zlen, carry, tmp1); 7579 7580 pop(len); 7581 pop(zlen); 7582 jmp(L_second_loop); 7583 7584 // Next infrequent code is moved outside loops. 7585 bind(L_last_x); 7586 movl(op1, Address(x, 0)); 7587 jmp(L_multiply); 7588 7589 bind(L_second_loop_exit); 7590 pop(len); 7591 pop(zlen); 7592 pop(len); 7593 pop(zlen); 7594 7595 // Fifth loop 7596 // Shift z left 1 bit. 7597 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4); 7598 7599 // z[zlen-1] |= x[len-1] & 1; 7600 movl(tmp3, Address(x, len, Address::times_4, -4)); 7601 andl(tmp3, 1); 7602 orl(Address(z, zlen, Address::times_4, -4), tmp3); 7603 7604 pop(tmp5); 7605 pop(tmp4); 7606 pop(tmp3); 7607 pop(tmp2); 7608 pop(tmp1); 7609 } 7610 7611 /** 7612 * Helper function for mul_add() 7613 * Multiply the in[] by int k and add to out[] starting at offset offs using 7614 * 128 bit by 32 bit multiply and return the carry in tmp5. 7615 * Only quad int aligned length of in[] is operated on in this function. 7616 * k is in rdxReg for BMI2Instructions, for others it is in tmp2. 7617 * This function preserves out, in and k registers. 7618 * len and offset point to the appropriate index in "in" & "out" correspondingly 7619 * tmp5 has the carry. 7620 * other registers are temporary and are modified. 
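 *
 * Illustrative only: per 64-bit word (after the rorq byte-order fixup) the loop
 * effectively computes the following, with k zero-extended to 64 bits
 * (hypothetical names, assuming unsigned __int128):
 *
 *   unsigned __int128 p = (unsigned __int128)in_word * k + out_word + carry;
 *   out_word = (uint64_t)p;        // stored back in big-endian order
 *   carry    = (uint64_t)(p >> 64);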
7621 * 7622 */ 7623 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, 7624 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3, 7625 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 7626 7627 Label L_first_loop, L_first_loop_exit; 7628 7629 movl(tmp1, len); 7630 shrl(tmp1, 2); 7631 7632 bind(L_first_loop); 7633 subl(tmp1, 1); 7634 jccb(Assembler::negative, L_first_loop_exit); 7635 7636 subl(len, 4); 7637 subl(offset, 4); 7638 7639 Register op2 = tmp2; 7640 const Register sum = tmp3; 7641 const Register op1 = tmp4; 7642 const Register carry = tmp5; 7643 7644 if (UseBMI2Instructions) { 7645 op2 = rdxReg; 7646 } 7647 7648 movq(op1, Address(in, len, Address::times_4, 8)); 7649 rorq(op1, 32); 7650 movq(sum, Address(out, offset, Address::times_4, 8)); 7651 rorq(sum, 32); 7652 if (UseBMI2Instructions) { 7653 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 7654 } 7655 else { 7656 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 7657 } 7658 // Store back in big endian from little endian 7659 rorq(sum, 0x20); 7660 movq(Address(out, offset, Address::times_4, 8), sum); 7661 7662 movq(op1, Address(in, len, Address::times_4, 0)); 7663 rorq(op1, 32); 7664 movq(sum, Address(out, offset, Address::times_4, 0)); 7665 rorq(sum, 32); 7666 if (UseBMI2Instructions) { 7667 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 7668 } 7669 else { 7670 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 7671 } 7672 // Store back in big endian from little endian 7673 rorq(sum, 0x20); 7674 movq(Address(out, offset, Address::times_4, 0), sum); 7675 7676 jmp(L_first_loop); 7677 bind(L_first_loop_exit); 7678 } 7679 7680 /** 7681 * Code for BigInteger::mulAdd() intrinsic 7682 * 7683 * rdi: out 7684 * rsi: in 7685 * r11: offs (out.length - offset) 7686 * rcx: len 7687 * r8: k 7688 * r12: tmp1 7689 * r13: tmp2 7690 * r14: tmp3 7691 * r15: tmp4 7692 * rbx: tmp5 7693 * Multiply the in[] by word k and add to out[], return the carry in rax 7694 */ 7695 void MacroAssembler::mul_add(Register out, Register in, Register offs, 7696 Register len, Register k, Register tmp1, Register tmp2, Register tmp3, 7697 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 7698 7699 Label L_carry, L_last_in, L_done; 7700 7701 // carry = 0; 7702 // for (int j=len-1; j >= 0; j--) { 7703 // long product = (in[j] & LONG_MASK) * kLong + 7704 // (out[offs] & LONG_MASK) + carry; 7705 // out[offs--] = (int)product; 7706 // carry = product >>> 32; 7707 // } 7708 // 7709 push(tmp1); 7710 push(tmp2); 7711 push(tmp3); 7712 push(tmp4); 7713 push(tmp5); 7714 7715 Register op2 = tmp2; 7716 const Register sum = tmp3; 7717 const Register op1 = tmp4; 7718 const Register carry = tmp5; 7719 7720 if (UseBMI2Instructions) { 7721 op2 = rdxReg; 7722 movl(op2, k); 7723 } 7724 else { 7725 movl(op2, k); 7726 } 7727 7728 xorq(carry, carry); 7729 7730 //First loop 7731 7732 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply 7733 //The carry is in tmp5 7734 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg); 7735 7736 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any 7737 decrementl(len); 7738 jccb(Assembler::negative, L_carry); 7739 decrementl(len); 7740 jccb(Assembler::negative, L_last_in); 7741 7742 movq(op1, Address(in, len, Address::times_4, 0)); 7743 rorq(op1, 32); 7744 7745 subl(offs, 2); 7746 movq(sum, Address(out, offs, Address::times_4, 0)); 7747 rorq(sum, 32); 7748 7749 if (UseBMI2Instructions) { 7750 
multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 7751 } 7752 else { 7753 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 7754 } 7755 7756 // Store back in big endian from little endian 7757 rorq(sum, 0x20); 7758 movq(Address(out, offs, Address::times_4, 0), sum); 7759 7760 testl(len, len); 7761 jccb(Assembler::zero, L_carry); 7762 7763 //Multiply the last in[] entry, if any 7764 bind(L_last_in); 7765 movl(op1, Address(in, 0)); 7766 movl(sum, Address(out, offs, Address::times_4, -4)); 7767 7768 movl(raxReg, k); 7769 mull(op1); //tmp4 * eax -> edx:eax 7770 addl(sum, carry); 7771 adcl(rdxReg, 0); 7772 addl(sum, raxReg); 7773 adcl(rdxReg, 0); 7774 movl(carry, rdxReg); 7775 7776 movl(Address(out, offs, Address::times_4, -4), sum); 7777 7778 bind(L_carry); 7779 //return tmp5/carry as carry in rax 7780 movl(rax, carry); 7781 7782 bind(L_done); 7783 pop(tmp5); 7784 pop(tmp4); 7785 pop(tmp3); 7786 pop(tmp2); 7787 pop(tmp1); 7788 } 7789 #endif 7790 7791 /** 7792 * Emits code to update CRC-32 with a byte value according to constants in table 7793 * 7794 * @param [in,out]crc Register containing the crc. 7795 * @param [in]val Register containing the byte to fold into the CRC. 7796 * @param [in]table Register containing the table of crc constants. 7797 * 7798 * uint32_t crc; 7799 * val = crc_table[(val ^ crc) & 0xFF]; 7800 * crc = val ^ (crc >> 8); 7801 * 7802 */ 7803 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 7804 xorl(val, crc); 7805 andl(val, 0xFF); 7806 shrl(crc, 8); // unsigned shift 7807 xorl(crc, Address(table, val, Address::times_4, 0)); 7808 } 7809 7810 /** 7811 * Fold 128-bit data chunk 7812 */ 7813 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) { 7814 if (UseAVX > 0) { 7815 vpclmulhdq(xtmp, xK, xcrc); // [123:64] 7816 vpclmulldq(xcrc, xK, xcrc); // [63:0] 7817 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */); 7818 pxor(xcrc, xtmp); 7819 } else { 7820 movdqa(xtmp, xcrc); 7821 pclmulhdq(xtmp, xK); // [123:64] 7822 pclmulldq(xcrc, xK); // [63:0] 7823 pxor(xcrc, xtmp); 7824 movdqu(xtmp, Address(buf, offset)); 7825 pxor(xcrc, xtmp); 7826 } 7827 } 7828 7829 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) { 7830 if (UseAVX > 0) { 7831 vpclmulhdq(xtmp, xK, xcrc); 7832 vpclmulldq(xcrc, xK, xcrc); 7833 pxor(xcrc, xbuf); 7834 pxor(xcrc, xtmp); 7835 } else { 7836 movdqa(xtmp, xcrc); 7837 pclmulhdq(xtmp, xK); 7838 pclmulldq(xcrc, xK); 7839 pxor(xcrc, xbuf); 7840 pxor(xcrc, xtmp); 7841 } 7842 } 7843 7844 /** 7845 * 8-bit folds to compute 32-bit CRC 7846 * 7847 * uint64_t xcrc; 7848 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8); 7849 */ 7850 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) { 7851 movdl(tmp, xcrc); 7852 andl(tmp, 0xFF); 7853 movdl(xtmp, Address(table, tmp, Address::times_4, 0)); 7854 psrldq(xcrc, 1); // unsigned shift one byte 7855 pxor(xcrc, xtmp); 7856 } 7857 7858 /** 7859 * uint32_t crc; 7860 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 7861 */ 7862 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { 7863 movl(tmp, crc); 7864 andl(tmp, 0xFF); 7865 shrl(crc, 8); 7866 xorl(crc, Address(table, tmp, Address::times_4, 0)); 7867 } 7868 7869 /** 7870 * @param crc register containing existing CRC (32-bit) 7871 * @param buf register pointing to input byte buffer (byte*) 7872 * @param len register containing number of bytes 7873 * 
@param table register that will contain address of CRC table 7874 * @param tmp scratch register 7875 */ 7876 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) { 7877 assert_different_registers(crc, buf, len, table, tmp, rax); 7878 7879 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; 7880 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; 7881 7882 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 7883 // context for the registers used, where all instructions below are using 128-bit mode 7884 // On EVEX without VL and BW, these instructions will all be AVX. 7885 lea(table, ExternalAddress(StubRoutines::crc_table_addr())); 7886 notl(crc); // ~crc 7887 cmpl(len, 16); 7888 jcc(Assembler::less, L_tail); 7889 7890 // Align buffer to 16 bytes 7891 movl(tmp, buf); 7892 andl(tmp, 0xF); 7893 jccb(Assembler::zero, L_aligned); 7894 subl(tmp, 16); 7895 addl(len, tmp); 7896 7897 align(4); 7898 BIND(L_align_loop); 7899 movsbl(rax, Address(buf, 0)); // load byte with sign extension 7900 update_byte_crc32(crc, rax, table); 7901 increment(buf); 7902 incrementl(tmp); 7903 jccb(Assembler::less, L_align_loop); 7904 7905 BIND(L_aligned); 7906 movl(tmp, len); // save 7907 shrl(len, 4); 7908 jcc(Assembler::zero, L_tail_restore); 7909 7910 // Fold crc into first bytes of vector 7911 movdqa(xmm1, Address(buf, 0)); 7912 movdl(rax, xmm1); 7913 xorl(crc, rax); 7914 if (VM_Version::supports_sse4_1()) { 7915 pinsrd(xmm1, crc, 0); 7916 } else { 7917 pinsrw(xmm1, crc, 0); 7918 shrl(crc, 16); 7919 pinsrw(xmm1, crc, 1); 7920 } 7921 addptr(buf, 16); 7922 subl(len, 4); // len > 0 7923 jcc(Assembler::less, L_fold_tail); 7924 7925 movdqa(xmm2, Address(buf, 0)); 7926 movdqa(xmm3, Address(buf, 16)); 7927 movdqa(xmm4, Address(buf, 32)); 7928 addptr(buf, 48); 7929 subl(len, 3); 7930 jcc(Assembler::lessEqual, L_fold_512b); 7931 7932 // Fold total 512 bits of polynomial on each iteration, 7933 // 128 bits per each of 4 parallel streams. 7934 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32)); 7935 7936 align32(); 7937 BIND(L_fold_512b_loop); 7938 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); 7939 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16); 7940 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32); 7941 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48); 7942 addptr(buf, 64); 7943 subl(len, 4); 7944 jcc(Assembler::greater, L_fold_512b_loop); 7945 7946 // Fold 512 bits to 128 bits. 7947 BIND(L_fold_512b); 7948 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); 7949 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2); 7950 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3); 7951 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4); 7952 7953 // Fold the rest of 128 bits data chunks 7954 BIND(L_fold_tail); 7955 addl(len, 3); 7956 jccb(Assembler::lessEqual, L_fold_128b); 7957 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); 7958 7959 BIND(L_fold_tail_loop); 7960 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); 7961 addptr(buf, 16); 7962 decrementl(len); 7963 jccb(Assembler::greater, L_fold_tail_loop); 7964 7965 // Fold 128 bits in xmm1 down into 32 bits in crc register. 
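// Sketch of the reduction that follows: a carry-less multiply folds the upper
// half of xmm1 onto the lower half, and the final 32 bits are then peeled off
// with eight 8-bit folds (four in the xmm domain, four on the general-purpose
// register), each of the form documented for fold_8bit_crc32 above:
//
//   x = table[x & 0xFF] ^ (x >> 8);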
7966 BIND(L_fold_128b); 7967 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr())); 7968 if (UseAVX > 0) { 7969 vpclmulqdq(xmm2, xmm0, xmm1, 0x1); 7970 vpand(xmm3, xmm0, xmm2, 0 /* vector_len */); 7971 vpclmulqdq(xmm0, xmm0, xmm3, 0x1); 7972 } else { 7973 movdqa(xmm2, xmm0); 7974 pclmulqdq(xmm2, xmm1, 0x1); 7975 movdqa(xmm3, xmm0); 7976 pand(xmm3, xmm2); 7977 pclmulqdq(xmm0, xmm3, 0x1); 7978 } 7979 psrldq(xmm1, 8); 7980 psrldq(xmm2, 4); 7981 pxor(xmm0, xmm1); 7982 pxor(xmm0, xmm2); 7983 7984 // 8 8-bit folds to compute 32-bit CRC. 7985 for (int j = 0; j < 4; j++) { 7986 fold_8bit_crc32(xmm0, table, xmm1, rax); 7987 } 7988 movdl(crc, xmm0); // mov 32 bits to general register 7989 for (int j = 0; j < 4; j++) { 7990 fold_8bit_crc32(crc, table, rax); 7991 } 7992 7993 BIND(L_tail_restore); 7994 movl(len, tmp); // restore 7995 BIND(L_tail); 7996 andl(len, 0xf); 7997 jccb(Assembler::zero, L_exit); 7998 7999 // Fold the rest of bytes 8000 align(4); 8001 BIND(L_tail_loop); 8002 movsbl(rax, Address(buf, 0)); // load byte with sign extension 8003 update_byte_crc32(crc, rax, table); 8004 increment(buf); 8005 decrementl(len); 8006 jccb(Assembler::greater, L_tail_loop); 8007 8008 BIND(L_exit); 8009 notl(crc); // ~c 8010 } 8011 8012 #ifdef _LP64 8013 // Helper function for AVX 512 CRC32 8014 // Fold 512-bit data chunks 8015 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, 8016 Register pos, int offset) { 8017 evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit); 8018 evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64] 8019 evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0] 8020 evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */); 8021 evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */); 8022 } 8023 8024 // Helper function for AVX 512 CRC32 8025 // Compute CRC32 for < 256B buffers 8026 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos, 8027 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop, 8028 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) { 8029 8030 Label L_less_than_32, L_exact_16_left, L_less_than_16_left; 8031 Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left; 8032 Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2; 8033 8034 // check if there is enough buffer to be able to fold 16B at a time 8035 cmpl(len, 32); 8036 jcc(Assembler::less, L_less_than_32); 8037 8038 // if there is, load the constants 8039 movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10 8040 movdl(xmm0, crc); // get the initial crc value 8041 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext 8042 pxor(xmm7, xmm0); 8043 8044 // update the buffer pointer 8045 addl(pos, 16); 8046 //update the counter.subtract 32 instead of 16 to save one instruction from the loop 8047 subl(len, 32); 8048 jmp(L_16B_reduction_loop); 8049 8050 bind(L_less_than_32); 8051 //mov initial crc to the return value. this is necessary for zero - length buffers. 
8052 movl(rax, crc); 8053 testl(len, len); 8054 jcc(Assembler::equal, L_cleanup); 8055 8056 movdl(xmm0, crc); //get the initial crc value 8057 8058 cmpl(len, 16); 8059 jcc(Assembler::equal, L_exact_16_left); 8060 jcc(Assembler::less, L_less_than_16_left); 8061 8062 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext 8063 pxor(xmm7, xmm0); //xor the initial crc value 8064 addl(pos, 16); 8065 subl(len, 16); 8066 movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10 8067 jmp(L_get_last_two_xmms); 8068 8069 bind(L_less_than_16_left); 8070 //use stack space to load data less than 16 bytes, zero - out the 16B in memory first. 8071 pxor(xmm1, xmm1); 8072 movptr(tmp1, rsp); 8073 movdqu(Address(tmp1, 0 * 16), xmm1); 8074 8075 cmpl(len, 4); 8076 jcc(Assembler::less, L_only_less_than_4); 8077 8078 //backup the counter value 8079 movl(tmp2, len); 8080 cmpl(len, 8); 8081 jcc(Assembler::less, L_less_than_8_left); 8082 8083 //load 8 Bytes 8084 movq(rax, Address(buf, pos, Address::times_1, 0 * 16)); 8085 movq(Address(tmp1, 0 * 16), rax); 8086 addptr(tmp1, 8); 8087 subl(len, 8); 8088 addl(pos, 8); 8089 8090 bind(L_less_than_8_left); 8091 cmpl(len, 4); 8092 jcc(Assembler::less, L_less_than_4_left); 8093 8094 //load 4 Bytes 8095 movl(rax, Address(buf, pos, Address::times_1, 0)); 8096 movl(Address(tmp1, 0 * 16), rax); 8097 addptr(tmp1, 4); 8098 subl(len, 4); 8099 addl(pos, 4); 8100 8101 bind(L_less_than_4_left); 8102 cmpl(len, 2); 8103 jcc(Assembler::less, L_less_than_2_left); 8104 8105 // load 2 Bytes 8106 movw(rax, Address(buf, pos, Address::times_1, 0)); 8107 movl(Address(tmp1, 0 * 16), rax); 8108 addptr(tmp1, 2); 8109 subl(len, 2); 8110 addl(pos, 2); 8111 8112 bind(L_less_than_2_left); 8113 cmpl(len, 1); 8114 jcc(Assembler::less, L_zero_left); 8115 8116 // load 1 Byte 8117 movb(rax, Address(buf, pos, Address::times_1, 0)); 8118 movb(Address(tmp1, 0 * 16), rax); 8119 8120 bind(L_zero_left); 8121 movdqu(xmm7, Address(rsp, 0)); 8122 pxor(xmm7, xmm0); //xor the initial crc value 8123 8124 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr())); 8125 movdqu(xmm0, Address(rax, tmp2)); 8126 pshufb(xmm7, xmm0); 8127 jmp(L_128_done); 8128 8129 bind(L_exact_16_left); 8130 movdqu(xmm7, Address(buf, pos, Address::times_1, 0)); 8131 pxor(xmm7, xmm0); //xor the initial crc value 8132 jmp(L_128_done); 8133 8134 bind(L_only_less_than_4); 8135 cmpl(len, 3); 8136 jcc(Assembler::less, L_only_less_than_3); 8137 8138 // load 3 Bytes 8139 movb(rax, Address(buf, pos, Address::times_1, 0)); 8140 movb(Address(tmp1, 0), rax); 8141 8142 movb(rax, Address(buf, pos, Address::times_1, 1)); 8143 movb(Address(tmp1, 1), rax); 8144 8145 movb(rax, Address(buf, pos, Address::times_1, 2)); 8146 movb(Address(tmp1, 2), rax); 8147 8148 movdqu(xmm7, Address(rsp, 0)); 8149 pxor(xmm7, xmm0); //xor the initial crc value 8150 8151 pslldq(xmm7, 0x5); 8152 jmp(L_barrett); 8153 bind(L_only_less_than_3); 8154 cmpl(len, 2); 8155 jcc(Assembler::less, L_only_less_than_2); 8156 8157 // load 2 Bytes 8158 movb(rax, Address(buf, pos, Address::times_1, 0)); 8159 movb(Address(tmp1, 0), rax); 8160 8161 movb(rax, Address(buf, pos, Address::times_1, 1)); 8162 movb(Address(tmp1, 1), rax); 8163 8164 movdqu(xmm7, Address(rsp, 0)); 8165 pxor(xmm7, xmm0); //xor the initial crc value 8166 8167 pslldq(xmm7, 0x6); 8168 jmp(L_barrett); 8169 8170 bind(L_only_less_than_2); 8171 //load 1 Byte 8172 movb(rax, Address(buf, pos, Address::times_1, 0)); 8173 movb(Address(tmp1, 0), rax); 8174 8175 movdqu(xmm7, Address(rsp, 
0)); 8176 pxor(xmm7, xmm0); //xor the initial crc value 8177 8178 pslldq(xmm7, 0x7); 8179 } 8180 8181 /** 8182 * Compute CRC32 using AVX512 instructions 8183 * param crc register containing existing CRC (32-bit) 8184 * param buf register pointing to input byte buffer (byte*) 8185 * param len register containing number of bytes 8186 * param table address of crc or crc32c table 8187 * param tmp1 scratch register 8188 * param tmp2 scratch register 8189 * return rax result register 8190 * 8191 * This routine is identical for crc32c with the exception of the precomputed constant 8192 * table which will be passed as the table argument. The calculation steps are 8193 * the same for both variants. 8194 */ 8195 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) { 8196 assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12); 8197 8198 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; 8199 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; 8200 Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop; 8201 Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop; 8202 Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup; 8203 8204 const Register pos = r12; 8205 push(r12); 8206 subptr(rsp, 16 * 2 + 8); 8207 8208 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 8209 // context for the registers used, where all instructions below are using 128-bit mode 8210 // On EVEX without VL and BW, these instructions will all be AVX. 8211 movl(pos, 0); 8212 8213 // check if smaller than 256B 8214 cmpl(len, 256); 8215 jcc(Assembler::less, L_less_than_256); 8216 8217 // load the initial crc value 8218 movdl(xmm10, crc); 8219 8220 // receive the initial 64B data, xor the initial crc value 8221 evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); 8222 evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); 8223 evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit); 8224 evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4 8225 8226 subl(len, 256); 8227 cmpl(len, 256); 8228 jcc(Assembler::less, L_fold_128_B_loop); 8229 8230 evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); 8231 evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); 8232 evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2 8233 subl(len, 256); 8234 8235 bind(L_fold_256_B_loop); 8236 addl(pos, 256); 8237 fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64); 8238 fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64); 8239 fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64); 8240 fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64); 8241 8242 subl(len, 256); 8243 jcc(Assembler::greaterEqual, L_fold_256_B_loop); 8244 8245 // Fold 256 into 128 8246 addl(pos, 256); 8247 evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit); 8248 evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit); 8249 vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC 8250 8251 evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit); 8252 evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit); 8253 vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC 8254 8255 evmovdquq(xmm0, xmm7, 
Assembler::AVX_512bit); 8256 evmovdquq(xmm4, xmm8, Assembler::AVX_512bit); 8257 8258 addl(len, 128); 8259 jmp(L_fold_128_B_register); 8260 8261 // at this section of the code, there is 128 * x + y(0 <= y<128) bytes of buffer.The fold_128_B_loop 8262 // loop will fold 128B at a time until we have 128 + y Bytes of buffer 8263 8264 // fold 128B at a time.This section of the code folds 8 xmm registers in parallel 8265 bind(L_fold_128_B_loop); 8266 addl(pos, 128); 8267 fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64); 8268 fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64); 8269 8270 subl(len, 128); 8271 jcc(Assembler::greaterEqual, L_fold_128_B_loop); 8272 8273 addl(pos, 128); 8274 8275 // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128 8276 // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 8277 bind(L_fold_128_B_register); 8278 evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16 8279 evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0 8280 evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit); 8281 evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit); 8282 // save last that has no multiplicand 8283 vextracti64x2(xmm7, xmm4, 3); 8284 8285 evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit); 8286 evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit); 8287 // Needed later in reduction loop 8288 movdqu(xmm10, Address(table, 1 * 16)); 8289 vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC 8290 vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC 8291 8292 // Swap 1,0,3,2 - 01 00 11 10 8293 evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit); 8294 evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit); 8295 vextracti128(xmm5, xmm8, 1); 8296 evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit); 8297 8298 // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop 8299 // instead of a cmp instruction, we use the negative flag with the jl instruction 8300 addl(len, 128 - 16); 8301 jcc(Assembler::less, L_final_reduction_for_128); 8302 8303 bind(L_16B_reduction_loop); 8304 vpclmulqdq(xmm8, xmm7, xmm10, 0x01); 8305 vpclmulqdq(xmm7, xmm7, xmm10, 0x10); 8306 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit); 8307 movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16)); 8308 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 8309 addl(pos, 16); 8310 subl(len, 16); 8311 jcc(Assembler::greaterEqual, L_16B_reduction_loop); 8312 8313 bind(L_final_reduction_for_128); 8314 addl(len, 16); 8315 jcc(Assembler::equal, L_128_done); 8316 8317 bind(L_get_last_two_xmms); 8318 movdqu(xmm2, xmm7); 8319 addl(pos, len); 8320 movdqu(xmm1, Address(buf, pos, Address::times_1, -16)); 8321 subl(pos, len); 8322 8323 // get rid of the extra data that was loaded before 8324 // load the shift constant 8325 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr())); 8326 movdqu(xmm0, Address(rax, len)); 8327 addl(rax, len); 8328 8329 vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 8330 //Change mask to 512 8331 vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2); 8332 vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit); 8333 8334 blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit); 8335 vpclmulqdq(xmm8, xmm7, xmm10, 0x01); 8336 vpclmulqdq(xmm7, 
xmm7, xmm10, 0x10); 8337 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit); 8338 vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit); 8339 8340 bind(L_128_done); 8341 // compute crc of a 128-bit value 8342 movdqu(xmm10, Address(table, 3 * 16)); 8343 movdqu(xmm0, xmm7); 8344 8345 // 64b fold 8346 vpclmulqdq(xmm7, xmm7, xmm10, 0x0); 8347 vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit); 8348 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 8349 8350 // 32b fold 8351 movdqu(xmm0, xmm7); 8352 vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit); 8353 vpclmulqdq(xmm7, xmm7, xmm10, 0x10); 8354 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 8355 jmp(L_barrett); 8356 8357 bind(L_less_than_256); 8358 kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup); 8359 8360 //barrett reduction 8361 bind(L_barrett); 8362 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2); 8363 movdqu(xmm1, xmm7); 8364 movdqu(xmm2, xmm7); 8365 movdqu(xmm10, Address(table, 4 * 16)); 8366 8367 pclmulqdq(xmm7, xmm10, 0x0); 8368 pxor(xmm7, xmm2); 8369 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2); 8370 movdqu(xmm2, xmm7); 8371 pclmulqdq(xmm7, xmm10, 0x10); 8372 pxor(xmm7, xmm2); 8373 pxor(xmm7, xmm1); 8374 pextrd(crc, xmm7, 2); 8375 8376 bind(L_cleanup); 8377 addptr(rsp, 16 * 2 + 8); 8378 pop(r12); 8379 } 8380 8381 // S. Gueron / Information Processing Letters 112 (2012) 184 8382 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table. 8383 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0]. 8384 // Output: the 64-bit carry-less product of B * CONST 8385 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n, 8386 Register tmp1, Register tmp2, Register tmp3) { 8387 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr())); 8388 if (n > 0) { 8389 addq(tmp3, n * 256 * 8); 8390 } 8391 // Q1 = TABLEExt[n][B & 0xFF]; 8392 movl(tmp1, in); 8393 andl(tmp1, 0x000000FF); 8394 shll(tmp1, 3); 8395 addq(tmp1, tmp3); 8396 movq(tmp1, Address(tmp1, 0)); 8397 8398 // Q2 = TABLEExt[n][B >> 8 & 0xFF]; 8399 movl(tmp2, in); 8400 shrl(tmp2, 8); 8401 andl(tmp2, 0x000000FF); 8402 shll(tmp2, 3); 8403 addq(tmp2, tmp3); 8404 movq(tmp2, Address(tmp2, 0)); 8405 8406 shlq(tmp2, 8); 8407 xorq(tmp1, tmp2); 8408 8409 // Q3 = TABLEExt[n][B >> 16 & 0xFF]; 8410 movl(tmp2, in); 8411 shrl(tmp2, 16); 8412 andl(tmp2, 0x000000FF); 8413 shll(tmp2, 3); 8414 addq(tmp2, tmp3); 8415 movq(tmp2, Address(tmp2, 0)); 8416 8417 shlq(tmp2, 16); 8418 xorq(tmp1, tmp2); 8419 8420 // Q4 = TABLEExt[n][B >> 24 & 0xFF]; 8421 shrl(in, 24); 8422 andl(in, 0x000000FF); 8423 shll(in, 3); 8424 addq(in, tmp3); 8425 movq(in, Address(in, 0)); 8426 8427 shlq(in, 24); 8428 xorq(in, tmp1); 8429 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; 8430 } 8431 8432 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1, 8433 Register in_out, 8434 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported, 8435 XMMRegister w_xtmp2, 8436 Register tmp1, 8437 Register n_tmp2, Register n_tmp3) { 8438 if (is_pclmulqdq_supported) { 8439 movdl(w_xtmp1, in_out); // modified blindly 8440 8441 movl(tmp1, const_or_pre_comp_const_index); 8442 movdl(w_xtmp2, tmp1); 8443 pclmulqdq(w_xtmp1, w_xtmp2, 0); 8444 8445 movdq(in_out, w_xtmp1); 8446 } else { 8447 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3); 8448 } 8449 } 8450 8451 // 
Recombination Alternative 2: No bit-reflections 8452 // T1 = (CRC_A * U1) << 1 8453 // T2 = (CRC_B * U2) << 1 8454 // C1 = T1 >> 32 8455 // C2 = T2 >> 32 8456 // T1 = T1 & 0xFFFFFFFF 8457 // T2 = T2 & 0xFFFFFFFF 8458 // T1 = CRC32(0, T1) 8459 // T2 = CRC32(0, T2) 8460 // C1 = C1 ^ T1 8461 // C2 = C2 ^ T2 8462 // CRC = C1 ^ C2 ^ CRC_C 8463 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2, 8464 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 8465 Register tmp1, Register tmp2, 8466 Register n_tmp3) { 8467 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 8468 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 8469 shlq(in_out, 1); 8470 movl(tmp1, in_out); 8471 shrq(in_out, 32); 8472 xorl(tmp2, tmp2); 8473 crc32(tmp2, tmp1, 4); 8474 xorl(in_out, tmp2); // we don't care about upper 32 bit contents here 8475 shlq(in1, 1); 8476 movl(tmp1, in1); 8477 shrq(in1, 32); 8478 xorl(tmp2, tmp2); 8479 crc32(tmp2, tmp1, 4); 8480 xorl(in1, tmp2); 8481 xorl(in_out, in1); 8482 xorl(in_out, in2); 8483 } 8484 8485 // Set N to predefined value 8486 // Subtract from a lenght of a buffer 8487 // execute in a loop: 8488 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0 8489 // for i = 1 to N do 8490 // CRC_A = CRC32(CRC_A, A[i]) 8491 // CRC_B = CRC32(CRC_B, B[i]) 8492 // CRC_C = CRC32(CRC_C, C[i]) 8493 // end for 8494 // Recombine 8495 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, 8496 Register in_out1, Register in_out2, Register in_out3, 8497 Register tmp1, Register tmp2, Register tmp3, 8498 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 8499 Register tmp4, Register tmp5, 8500 Register n_tmp6) { 8501 Label L_processPartitions; 8502 Label L_processPartition; 8503 Label L_exit; 8504 8505 bind(L_processPartitions); 8506 cmpl(in_out1, 3 * size); 8507 jcc(Assembler::less, L_exit); 8508 xorl(tmp1, tmp1); 8509 xorl(tmp2, tmp2); 8510 movq(tmp3, in_out2); 8511 addq(tmp3, size); 8512 8513 bind(L_processPartition); 8514 crc32(in_out3, Address(in_out2, 0), 8); 8515 crc32(tmp1, Address(in_out2, size), 8); 8516 crc32(tmp2, Address(in_out2, size * 2), 8); 8517 addq(in_out2, 8); 8518 cmpq(in_out2, tmp3); 8519 jcc(Assembler::less, L_processPartition); 8520 crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2, 8521 w_xtmp1, w_xtmp2, w_xtmp3, 8522 tmp4, tmp5, 8523 n_tmp6); 8524 addq(in_out2, 2 * size); 8525 subl(in_out1, 3 * size); 8526 jmp(L_processPartitions); 8527 8528 bind(L_exit); 8529 } 8530 #else 8531 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n, 8532 Register tmp1, Register tmp2, Register tmp3, 8533 XMMRegister xtmp1, XMMRegister xtmp2) { 8534 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr())); 8535 if (n > 0) { 8536 addl(tmp3, n * 256 * 8); 8537 } 8538 // Q1 = TABLEExt[n][B & 0xFF]; 8539 movl(tmp1, in_out); 8540 andl(tmp1, 0x000000FF); 8541 shll(tmp1, 3); 8542 addl(tmp1, tmp3); 8543 movq(xtmp1, Address(tmp1, 0)); 8544 8545 // Q2 = TABLEExt[n][B >> 8 & 0xFF]; 8546 movl(tmp2, in_out); 8547 shrl(tmp2, 8); 8548 andl(tmp2, 0x000000FF); 8549 shll(tmp2, 3); 8550 addl(tmp2, tmp3); 8551 movq(xtmp2, 
Address(tmp2, 0)); 8552 8553 psllq(xtmp2, 8); 8554 pxor(xtmp1, xtmp2); 8555 8556 // Q3 = TABLEExt[n][B >> 16 & 0xFF]; 8557 movl(tmp2, in_out); 8558 shrl(tmp2, 16); 8559 andl(tmp2, 0x000000FF); 8560 shll(tmp2, 3); 8561 addl(tmp2, tmp3); 8562 movq(xtmp2, Address(tmp2, 0)); 8563 8564 psllq(xtmp2, 16); 8565 pxor(xtmp1, xtmp2); 8566 8567 // Q4 = TABLEExt[n][B >> 24 & 0xFF]; 8568 shrl(in_out, 24); 8569 andl(in_out, 0x000000FF); 8570 shll(in_out, 3); 8571 addl(in_out, tmp3); 8572 movq(xtmp2, Address(in_out, 0)); 8573 8574 psllq(xtmp2, 24); 8575 pxor(xtmp1, xtmp2); // Result in CXMM 8576 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; 8577 } 8578 8579 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1, 8580 Register in_out, 8581 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported, 8582 XMMRegister w_xtmp2, 8583 Register tmp1, 8584 Register n_tmp2, Register n_tmp3) { 8585 if (is_pclmulqdq_supported) { 8586 movdl(w_xtmp1, in_out); 8587 8588 movl(tmp1, const_or_pre_comp_const_index); 8589 movdl(w_xtmp2, tmp1); 8590 pclmulqdq(w_xtmp1, w_xtmp2, 0); 8591 // Keep result in XMM since GPR is 32 bit in length 8592 } else { 8593 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2); 8594 } 8595 } 8596 8597 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2, 8598 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 8599 Register tmp1, Register tmp2, 8600 Register n_tmp3) { 8601 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 8602 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 8603 8604 psllq(w_xtmp1, 1); 8605 movdl(tmp1, w_xtmp1); 8606 psrlq(w_xtmp1, 32); 8607 movdl(in_out, w_xtmp1); 8608 8609 xorl(tmp2, tmp2); 8610 crc32(tmp2, tmp1, 4); 8611 xorl(in_out, tmp2); 8612 8613 psllq(w_xtmp2, 1); 8614 movdl(tmp1, w_xtmp2); 8615 psrlq(w_xtmp2, 32); 8616 movdl(in1, w_xtmp2); 8617 8618 xorl(tmp2, tmp2); 8619 crc32(tmp2, tmp1, 4); 8620 xorl(in1, tmp2); 8621 xorl(in_out, in1); 8622 xorl(in_out, in2); 8623 } 8624 8625 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, 8626 Register in_out1, Register in_out2, Register in_out3, 8627 Register tmp1, Register tmp2, Register tmp3, 8628 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 8629 Register tmp4, Register tmp5, 8630 Register n_tmp6) { 8631 Label L_processPartitions; 8632 Label L_processPartition; 8633 Label L_exit; 8634 8635 bind(L_processPartitions); 8636 cmpl(in_out1, 3 * size); 8637 jcc(Assembler::less, L_exit); 8638 xorl(tmp1, tmp1); 8639 xorl(tmp2, tmp2); 8640 movl(tmp3, in_out2); 8641 addl(tmp3, size); 8642 8643 bind(L_processPartition); 8644 crc32(in_out3, Address(in_out2, 0), 4); 8645 crc32(tmp1, Address(in_out2, size), 4); 8646 crc32(tmp2, Address(in_out2, size*2), 4); 8647 crc32(in_out3, Address(in_out2, 0+4), 4); 8648 crc32(tmp1, Address(in_out2, size+4), 4); 8649 crc32(tmp2, Address(in_out2, size*2+4), 4); 8650 addl(in_out2, 8); 8651 cmpl(in_out2, tmp3); 8652 jcc(Assembler::less, L_processPartition); 8653 8654 push(tmp3); 8655 push(in_out1); 8656 push(in_out2); 8657 tmp4 = tmp3; 8658 tmp5 = in_out1; 8659 n_tmp6 = in_out2; 8660 8661 
crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2, 8662 w_xtmp1, w_xtmp2, w_xtmp3, 8663 tmp4, tmp5, 8664 n_tmp6); 8665 8666 pop(in_out2); 8667 pop(in_out1); 8668 pop(tmp3); 8669 8670 addl(in_out2, 2 * size); 8671 subl(in_out1, 3 * size); 8672 jmp(L_processPartitions); 8673 8674 bind(L_exit); 8675 } 8676 #endif //LP64 8677 8678 #ifdef _LP64 8679 // Algorithm 2: Pipelined usage of the CRC32 instruction. 8680 // Input: A buffer I of L bytes. 8681 // Output: the CRC32C value of the buffer. 8682 // Notations: 8683 // Write L = 24N + r, with N = floor (L/24). 8684 // r = L mod 24 (0 <= r < 24). 8685 // Consider I as the concatenation of A|B|C|R, where A, B, C, each, 8686 // N quadwords, and R consists of r bytes. 8687 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1 8688 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1 8689 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1 8690 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1 8691 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2, 8692 Register tmp1, Register tmp2, Register tmp3, 8693 Register tmp4, Register tmp5, Register tmp6, 8694 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 8695 bool is_pclmulqdq_supported) { 8696 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS]; 8697 Label L_wordByWord; 8698 Label L_byteByByteProlog; 8699 Label L_byteByByte; 8700 Label L_exit; 8701 8702 if (is_pclmulqdq_supported ) { 8703 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; 8704 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1); 8705 8706 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2); 8707 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3); 8708 8709 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4); 8710 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5); 8711 assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\""); 8712 } else { 8713 const_or_pre_comp_const_index[0] = 1; 8714 const_or_pre_comp_const_index[1] = 0; 8715 8716 const_or_pre_comp_const_index[2] = 3; 8717 const_or_pre_comp_const_index[3] = 2; 8718 8719 const_or_pre_comp_const_index[4] = 5; 8720 const_or_pre_comp_const_index[5] = 4; 8721 } 8722 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported, 8723 in2, in1, in_out, 8724 tmp1, tmp2, tmp3, 8725 w_xtmp1, w_xtmp2, w_xtmp3, 8726 tmp4, tmp5, 8727 tmp6); 8728 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported, 8729 in2, in1, in_out, 8730 tmp1, tmp2, tmp3, 8731 w_xtmp1, w_xtmp2, w_xtmp3, 8732 tmp4, tmp5, 8733 tmp6); 8734 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported, 8735 in2, in1, in_out, 8736 tmp1, tmp2, tmp3, 8737 w_xtmp1, w_xtmp2, w_xtmp3, 8738 tmp4, tmp5, 8739 tmp6); 8740 movl(tmp1, in2); 8741 andl(tmp1, 0x00000007); 8742 negl(tmp1); 8743 addl(tmp1, in2); 8744 addq(tmp1, in1); 8745 8746 BIND(L_wordByWord); 8747 cmpq(in1, tmp1); 8748 jcc(Assembler::greaterEqual, L_byteByByteProlog); 8749 crc32(in_out, Address(in1, 0), 4); 8750 addq(in1, 4); 8751 jmp(L_wordByWord); 8752 8753 
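  // The tail that does not fill a complete three-way chunk is finished off
  // scalar-wise: the L_wordByWord loop above consumes the remaining length
  // rounded down to a multiple of eight, four bytes at a time, and the
  // L_byteByByte loop below handles the final (length % 8) bytes.
  // Roughly, as an illustrative C sketch (crc32c_u32()/crc32c_u8() stand in
  // for the 4-byte and 1-byte forms of the CRC32 instruction):
  //
  //   while (p < aligned_end) { crc = crc32c_u32(crc, *(uint32_t*)p); p += 4; }
  //   while (p < end)         { crc = crc32c_u8(crc, *p++); }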
BIND(L_byteByByteProlog); 8754 andl(in2, 0x00000007); 8755 movl(tmp2, 1); 8756 8757 BIND(L_byteByByte); 8758 cmpl(tmp2, in2); 8759 jccb(Assembler::greater, L_exit); 8760 crc32(in_out, Address(in1, 0), 1); 8761 incq(in1); 8762 incl(tmp2); 8763 jmp(L_byteByByte); 8764 8765 BIND(L_exit); 8766 } 8767 #else 8768 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2, 8769 Register tmp1, Register tmp2, Register tmp3, 8770 Register tmp4, Register tmp5, Register tmp6, 8771 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 8772 bool is_pclmulqdq_supported) { 8773 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS]; 8774 Label L_wordByWord; 8775 Label L_byteByByteProlog; 8776 Label L_byteByByte; 8777 Label L_exit; 8778 8779 if (is_pclmulqdq_supported) { 8780 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; 8781 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1); 8782 8783 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2); 8784 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3); 8785 8786 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4); 8787 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5); 8788 } else { 8789 const_or_pre_comp_const_index[0] = 1; 8790 const_or_pre_comp_const_index[1] = 0; 8791 8792 const_or_pre_comp_const_index[2] = 3; 8793 const_or_pre_comp_const_index[3] = 2; 8794 8795 const_or_pre_comp_const_index[4] = 5; 8796 const_or_pre_comp_const_index[5] = 4; 8797 } 8798 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported, 8799 in2, in1, in_out, 8800 tmp1, tmp2, tmp3, 8801 w_xtmp1, w_xtmp2, w_xtmp3, 8802 tmp4, tmp5, 8803 tmp6); 8804 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported, 8805 in2, in1, in_out, 8806 tmp1, tmp2, tmp3, 8807 w_xtmp1, w_xtmp2, w_xtmp3, 8808 tmp4, tmp5, 8809 tmp6); 8810 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported, 8811 in2, in1, in_out, 8812 tmp1, tmp2, tmp3, 8813 w_xtmp1, w_xtmp2, w_xtmp3, 8814 tmp4, tmp5, 8815 tmp6); 8816 movl(tmp1, in2); 8817 andl(tmp1, 0x00000007); 8818 negl(tmp1); 8819 addl(tmp1, in2); 8820 addl(tmp1, in1); 8821 8822 BIND(L_wordByWord); 8823 cmpl(in1, tmp1); 8824 jcc(Assembler::greaterEqual, L_byteByByteProlog); 8825 crc32(in_out, Address(in1,0), 4); 8826 addl(in1, 4); 8827 jmp(L_wordByWord); 8828 8829 BIND(L_byteByByteProlog); 8830 andl(in2, 0x00000007); 8831 movl(tmp2, 1); 8832 8833 BIND(L_byteByByte); 8834 cmpl(tmp2, in2); 8835 jccb(Assembler::greater, L_exit); 8836 movb(tmp1, Address(in1, 0)); 8837 crc32(in_out, tmp1, 1); 8838 incl(in1); 8839 incl(tmp2); 8840 jmp(L_byteByByte); 8841 8842 BIND(L_exit); 8843 } 8844 #endif // LP64 8845 #undef BIND 8846 #undef BLOCK_COMMENT 8847 8848 // Compress char[] array to byte[]. 
8849 // ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java 8850 // @IntrinsicCandidate 8851 // private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) { 8852 // for (int i = 0; i < len; i++) { 8853 // int c = src[srcOff++]; 8854 // if (c >>> 8 != 0) { 8855 // return 0; 8856 // } 8857 // dst[dstOff++] = (byte)c; 8858 // } 8859 // return len; 8860 // } 8861 void MacroAssembler::char_array_compress(Register src, Register dst, Register len, 8862 XMMRegister tmp1Reg, XMMRegister tmp2Reg, 8863 XMMRegister tmp3Reg, XMMRegister tmp4Reg, 8864 Register tmp5, Register result, KRegister mask1, KRegister mask2) { 8865 Label copy_chars_loop, return_length, return_zero, done; 8866 8867 // rsi: src 8868 // rdi: dst 8869 // rdx: len 8870 // rcx: tmp5 8871 // rax: result 8872 8873 // rsi holds start addr of source char[] to be compressed 8874 // rdi holds start addr of destination byte[] 8875 // rdx holds length 8876 8877 assert(len != result, ""); 8878 8879 // save length for return 8880 push(len); 8881 8882 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 8883 VM_Version::supports_avx512vlbw() && 8884 VM_Version::supports_bmi2()) { 8885 8886 Label copy_32_loop, copy_loop_tail, below_threshold; 8887 8888 // alignment 8889 Label post_alignment; 8890 8891 // if length of the string is less than 16, handle it in an old fashioned way 8892 testl(len, -32); 8893 jcc(Assembler::zero, below_threshold); 8894 8895 // First check whether a character is compressable ( <= 0xFF). 8896 // Create mask to test for Unicode chars inside zmm vector 8897 movl(result, 0x00FF); 8898 evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit); 8899 8900 testl(len, -64); 8901 jcc(Assembler::zero, post_alignment); 8902 8903 movl(tmp5, dst); 8904 andl(tmp5, (32 - 1)); 8905 negl(tmp5); 8906 andl(tmp5, (32 - 1)); 8907 8908 // bail out when there is nothing to be done 8909 testl(tmp5, 0xFFFFFFFF); 8910 jcc(Assembler::zero, post_alignment); 8911 8912 // ~(~0 << len), where len is the # of remaining elements to process 8913 movl(result, 0xFFFFFFFF); 8914 shlxl(result, result, tmp5); 8915 notl(result); 8916 kmovdl(mask2, result); 8917 8918 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); 8919 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit); 8920 ktestd(mask1, mask2); 8921 jcc(Assembler::carryClear, return_zero); 8922 8923 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit); 8924 8925 addptr(src, tmp5); 8926 addptr(src, tmp5); 8927 addptr(dst, tmp5); 8928 subl(len, tmp5); 8929 8930 bind(post_alignment); 8931 // end of alignment 8932 8933 movl(tmp5, len); 8934 andl(tmp5, (32 - 1)); // tail count (in chars) 8935 andl(len, ~(32 - 1)); // vector count (in chars) 8936 jcc(Assembler::zero, copy_loop_tail); 8937 8938 lea(src, Address(src, len, Address::times_2)); 8939 lea(dst, Address(dst, len, Address::times_1)); 8940 negptr(len); 8941 8942 bind(copy_32_loop); 8943 evmovdquw(tmp1Reg, Address(src, len, Address::times_2), /*merge*/ false, Assembler::AVX_512bit); 8944 evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); 8945 kortestdl(mask1, mask1); 8946 jcc(Assembler::carryClear, return_zero); 8947 8948 // All elements in current processed chunk are valid candidates for 8949 // compression. Write a truncated byte elements to the memory. 
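    // (evpmovwb narrows each 16-bit char to its low byte, i.e. the scalar
    //  dst[dstOff++] = (byte)c from the Java reference above, 32 chars at a time.)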
8950 evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit); 8951 addptr(len, 32); 8952 jcc(Assembler::notZero, copy_32_loop); 8953 8954 bind(copy_loop_tail); 8955 // bail out when there is nothing to be done 8956 testl(tmp5, 0xFFFFFFFF); 8957 jcc(Assembler::zero, return_length); 8958 8959 movl(len, tmp5); 8960 8961 // ~(~0 << len), where len is the # of remaining elements to process 8962 movl(result, 0xFFFFFFFF); 8963 shlxl(result, result, len); 8964 notl(result); 8965 8966 kmovdl(mask2, result); 8967 8968 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); 8969 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit); 8970 ktestd(mask1, mask2); 8971 jcc(Assembler::carryClear, return_zero); 8972 8973 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit); 8974 jmp(return_length); 8975 8976 bind(below_threshold); 8977 } 8978 8979 if (UseSSE42Intrinsics) { 8980 Label copy_32_loop, copy_16, copy_tail; 8981 8982 movl(result, len); 8983 8984 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors 8985 8986 // vectored compression 8987 andl(len, 0xfffffff0); // vector count (in chars) 8988 andl(result, 0x0000000f); // tail count (in chars) 8989 testl(len, len); 8990 jcc(Assembler::zero, copy_16); 8991 8992 // compress 16 chars per iter 8993 movdl(tmp1Reg, tmp5); 8994 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg 8995 pxor(tmp4Reg, tmp4Reg); 8996 8997 lea(src, Address(src, len, Address::times_2)); 8998 lea(dst, Address(dst, len, Address::times_1)); 8999 negptr(len); 9000 9001 bind(copy_32_loop); 9002 movdqu(tmp2Reg, Address(src, len, Address::times_2)); // load 1st 8 characters 9003 por(tmp4Reg, tmp2Reg); 9004 movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters 9005 por(tmp4Reg, tmp3Reg); 9006 ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector 9007 jcc(Assembler::notZero, return_zero); 9008 packuswb(tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte 9009 movdqu(Address(dst, len, Address::times_1), tmp2Reg); 9010 addptr(len, 16); 9011 jcc(Assembler::notZero, copy_32_loop); 9012 9013 // compress next vector of 8 chars (if any) 9014 bind(copy_16); 9015 movl(len, result); 9016 andl(len, 0xfffffff8); // vector count (in chars) 9017 andl(result, 0x00000007); // tail count (in chars) 9018 testl(len, len); 9019 jccb(Assembler::zero, copy_tail); 9020 9021 movdl(tmp1Reg, tmp5); 9022 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg 9023 pxor(tmp3Reg, tmp3Reg); 9024 9025 movdqu(tmp2Reg, Address(src, 0)); 9026 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector 9027 jccb(Assembler::notZero, return_zero); 9028 packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte 9029 movq(Address(dst, 0), tmp2Reg); 9030 addptr(src, 16); 9031 addptr(dst, 8); 9032 9033 bind(copy_tail); 9034 movl(len, result); 9035 } 9036 // compress 1 char per iter 9037 testl(len, len); 9038 jccb(Assembler::zero, return_length); 9039 lea(src, Address(src, len, Address::times_2)); 9040 lea(dst, Address(dst, len, Address::times_1)); 9041 negptr(len); 9042 9043 bind(copy_chars_loop); 9044 load_unsigned_short(result, Address(src, len, Address::times_2)); 9045 testl(result, 0xff00); // check if Unicode char 9046 jccb(Assembler::notZero, return_zero); 9047 movb(Address(dst, len, Address::times_1), result); // ASCII char; compress to 1 byte 9048 increment(len); 9049 jcc(Assembler::notZero, copy_chars_loop); 
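  // Note: the copy loops above run the index register 'len' from -count up to
  // zero after biasing src/dst with lea(), so a single increment-and-branch
  // serves as both the induction step and the loop-exit test.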
9050 9051 // if compression succeeded, return length 9052 bind(return_length); 9053 pop(result); 9054 jmpb(done); 9055 9056 // if compression failed, return 0 9057 bind(return_zero); 9058 xorl(result, result); 9059 addptr(rsp, wordSize); 9060 9061 bind(done); 9062 } 9063 9064 // Inflate byte[] array to char[]. 9065 // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java 9066 // @IntrinsicCandidate 9067 // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) { 9068 // for (int i = 0; i < len; i++) { 9069 // dst[dstOff++] = (char)(src[srcOff++] & 0xff); 9070 // } 9071 // } 9072 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, 9073 XMMRegister tmp1, Register tmp2, KRegister mask) { 9074 Label copy_chars_loop, done, below_threshold, avx3_threshold; 9075 // rsi: src 9076 // rdi: dst 9077 // rdx: len 9078 // rcx: tmp2 9079 9080 // rsi holds start addr of source byte[] to be inflated 9081 // rdi holds start addr of destination char[] 9082 // rdx holds length 9083 assert_different_registers(src, dst, len, tmp2); 9084 movl(tmp2, len); 9085 if ((UseAVX > 2) && // AVX512 9086 VM_Version::supports_avx512vlbw() && 9087 VM_Version::supports_bmi2()) { 9088 9089 Label copy_32_loop, copy_tail; 9090 Register tmp3_aliased = len; 9091 9092 // if length of the string is less than 16, handle it in an old fashioned way 9093 testl(len, -16); 9094 jcc(Assembler::zero, below_threshold); 9095 9096 testl(len, -1 * AVX3Threshold); 9097 jcc(Assembler::zero, avx3_threshold); 9098 9099 // In order to use only one arithmetic operation for the main loop we use 9100 // this pre-calculation 9101 andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop 9102 andl(len, -32); // vector count 9103 jccb(Assembler::zero, copy_tail); 9104 9105 lea(src, Address(src, len, Address::times_1)); 9106 lea(dst, Address(dst, len, Address::times_2)); 9107 negptr(len); 9108 9109 9110 // inflate 32 chars per iter 9111 bind(copy_32_loop); 9112 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit); 9113 evmovdquw(Address(dst, len, Address::times_2), tmp1, /*merge*/ false, Assembler::AVX_512bit); 9114 addptr(len, 32); 9115 jcc(Assembler::notZero, copy_32_loop); 9116 9117 bind(copy_tail); 9118 // bail out when there is nothing to be done 9119 testl(tmp2, -1); // we don't destroy the contents of tmp2 here 9120 jcc(Assembler::zero, done); 9121 9122 // ~(~0 << length), where length is the # of remaining elements to process 9123 movl(tmp3_aliased, -1); 9124 shlxl(tmp3_aliased, tmp3_aliased, tmp2); 9125 notl(tmp3_aliased); 9126 kmovdl(mask, tmp3_aliased); 9127 evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit); 9128 evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit); 9129 9130 jmp(done); 9131 bind(avx3_threshold); 9132 } 9133 if (UseSSE42Intrinsics) { 9134 Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail; 9135 9136 if (UseAVX > 1) { 9137 andl(tmp2, (16 - 1)); 9138 andl(len, -16); 9139 jccb(Assembler::zero, copy_new_tail); 9140 } else { 9141 andl(tmp2, 0x00000007); // tail count (in chars) 9142 andl(len, 0xfffffff8); // vector count (in chars) 9143 jccb(Assembler::zero, copy_tail); 9144 } 9145 9146 // vectored inflation 9147 lea(src, Address(src, len, Address::times_1)); 9148 lea(dst, Address(dst, len, Address::times_2)); 9149 negptr(len); 9150 9151 if (UseAVX > 1) { 9152 bind(copy_16_loop); 9153 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit); 9154 
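      // store the 16 zero-extended chars (32 bytes) back out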
vmovdqu(Address(dst, len, Address::times_2), tmp1); 9155 addptr(len, 16); 9156 jcc(Assembler::notZero, copy_16_loop); 9157 9158 bind(below_threshold); 9159 bind(copy_new_tail); 9160 movl(len, tmp2); 9161 andl(tmp2, 0x00000007); 9162 andl(len, 0xFFFFFFF8); 9163 jccb(Assembler::zero, copy_tail); 9164 9165 pmovzxbw(tmp1, Address(src, 0)); 9166 movdqu(Address(dst, 0), tmp1); 9167 addptr(src, 8); 9168 addptr(dst, 2 * 8); 9169 9170 jmp(copy_tail, true); 9171 } 9172 9173 // inflate 8 chars per iter 9174 bind(copy_8_loop); 9175 pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words 9176 movdqu(Address(dst, len, Address::times_2), tmp1); 9177 addptr(len, 8); 9178 jcc(Assembler::notZero, copy_8_loop); 9179 9180 bind(copy_tail); 9181 movl(len, tmp2); 9182 9183 cmpl(len, 4); 9184 jccb(Assembler::less, copy_bytes); 9185 9186 movdl(tmp1, Address(src, 0)); // load 4 byte chars 9187 pmovzxbw(tmp1, tmp1); 9188 movq(Address(dst, 0), tmp1); 9189 subptr(len, 4); 9190 addptr(src, 4); 9191 addptr(dst, 8); 9192 9193 bind(copy_bytes); 9194 } else { 9195 bind(below_threshold); 9196 } 9197 9198 testl(len, len); 9199 jccb(Assembler::zero, done); 9200 lea(src, Address(src, len, Address::times_1)); 9201 lea(dst, Address(dst, len, Address::times_2)); 9202 negptr(len); 9203 9204 // inflate 1 char per iter 9205 bind(copy_chars_loop); 9206 load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char 9207 movw(Address(dst, len, Address::times_2), tmp2); // inflate byte char to word 9208 increment(len); 9209 jcc(Assembler::notZero, copy_chars_loop); 9210 9211 bind(done); 9212 } 9213 9214 9215 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) { 9216 switch(type) { 9217 case T_BYTE: 9218 case T_BOOLEAN: 9219 evmovdqub(dst, kmask, src, false, vector_len); 9220 break; 9221 case T_CHAR: 9222 case T_SHORT: 9223 evmovdquw(dst, kmask, src, false, vector_len); 9224 break; 9225 case T_INT: 9226 case T_FLOAT: 9227 evmovdqul(dst, kmask, src, false, vector_len); 9228 break; 9229 case T_LONG: 9230 case T_DOUBLE: 9231 evmovdquq(dst, kmask, src, false, vector_len); 9232 break; 9233 default: 9234 fatal("Unexpected type argument %s", type2name(type)); 9235 break; 9236 } 9237 } 9238 9239 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) { 9240 switch(type) { 9241 case T_BYTE: 9242 case T_BOOLEAN: 9243 evmovdqub(dst, kmask, src, true, vector_len); 9244 break; 9245 case T_CHAR: 9246 case T_SHORT: 9247 evmovdquw(dst, kmask, src, true, vector_len); 9248 break; 9249 case T_INT: 9250 case T_FLOAT: 9251 evmovdqul(dst, kmask, src, true, vector_len); 9252 break; 9253 case T_LONG: 9254 case T_DOUBLE: 9255 evmovdquq(dst, kmask, src, true, vector_len); 9256 break; 9257 default: 9258 fatal("Unexpected type argument %s", type2name(type)); 9259 break; 9260 } 9261 } 9262 9263 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) { 9264 switch(masklen) { 9265 case 2: 9266 knotbl(dst, src); 9267 movl(rtmp, 3); 9268 kmovbl(ktmp, rtmp); 9269 kandbl(dst, ktmp, dst); 9270 break; 9271 case 4: 9272 knotbl(dst, src); 9273 movl(rtmp, 15); 9274 kmovbl(ktmp, rtmp); 9275 kandbl(dst, ktmp, dst); 9276 break; 9277 case 8: 9278 knotbl(dst, src); 9279 break; 9280 case 16: 9281 knotwl(dst, src); 9282 break; 9283 case 32: 9284 knotdl(dst, src); 9285 break; 9286 case 64: 9287 knotql(dst, src); 9288 break; 9289 default: 9290 fatal("Unexpected vector length %d", masklen); 9291 
break; 9292 } 9293 } 9294 9295 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) { 9296 switch(type) { 9297 case T_BOOLEAN: 9298 case T_BYTE: 9299 kandbl(dst, src1, src2); 9300 break; 9301 case T_CHAR: 9302 case T_SHORT: 9303 kandwl(dst, src1, src2); 9304 break; 9305 case T_INT: 9306 case T_FLOAT: 9307 kanddl(dst, src1, src2); 9308 break; 9309 case T_LONG: 9310 case T_DOUBLE: 9311 kandql(dst, src1, src2); 9312 break; 9313 default: 9314 fatal("Unexpected type argument %s", type2name(type)); 9315 break; 9316 } 9317 } 9318 9319 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) { 9320 switch(type) { 9321 case T_BOOLEAN: 9322 case T_BYTE: 9323 korbl(dst, src1, src2); 9324 break; 9325 case T_CHAR: 9326 case T_SHORT: 9327 korwl(dst, src1, src2); 9328 break; 9329 case T_INT: 9330 case T_FLOAT: 9331 kordl(dst, src1, src2); 9332 break; 9333 case T_LONG: 9334 case T_DOUBLE: 9335 korql(dst, src1, src2); 9336 break; 9337 default: 9338 fatal("Unexpected type argument %s", type2name(type)); 9339 break; 9340 } 9341 } 9342 9343 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) { 9344 switch(type) { 9345 case T_BOOLEAN: 9346 case T_BYTE: 9347 kxorbl(dst, src1, src2); 9348 break; 9349 case T_CHAR: 9350 case T_SHORT: 9351 kxorwl(dst, src1, src2); 9352 break; 9353 case T_INT: 9354 case T_FLOAT: 9355 kxordl(dst, src1, src2); 9356 break; 9357 case T_LONG: 9358 case T_DOUBLE: 9359 kxorql(dst, src1, src2); 9360 break; 9361 default: 9362 fatal("Unexpected type argument %s", type2name(type)); 9363 break; 9364 } 9365 } 9366 9367 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 9368 switch(type) { 9369 case T_BOOLEAN: 9370 case T_BYTE: 9371 evpermb(dst, mask, nds, src, merge, vector_len); break; 9372 case T_CHAR: 9373 case T_SHORT: 9374 evpermw(dst, mask, nds, src, merge, vector_len); break; 9375 case T_INT: 9376 case T_FLOAT: 9377 evpermd(dst, mask, nds, src, merge, vector_len); break; 9378 case T_LONG: 9379 case T_DOUBLE: 9380 evpermq(dst, mask, nds, src, merge, vector_len); break; 9381 default: 9382 fatal("Unexpected type argument %s", type2name(type)); break; 9383 } 9384 } 9385 9386 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9387 switch(type) { 9388 case T_BOOLEAN: 9389 case T_BYTE: 9390 evpermb(dst, mask, nds, src, merge, vector_len); break; 9391 case T_CHAR: 9392 case T_SHORT: 9393 evpermw(dst, mask, nds, src, merge, vector_len); break; 9394 case T_INT: 9395 case T_FLOAT: 9396 evpermd(dst, mask, nds, src, merge, vector_len); break; 9397 case T_LONG: 9398 case T_DOUBLE: 9399 evpermq(dst, mask, nds, src, merge, vector_len); break; 9400 default: 9401 fatal("Unexpected type argument %s", type2name(type)); break; 9402 } 9403 } 9404 9405 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9406 switch(type) { 9407 case T_BYTE: 9408 evpminsb(dst, mask, nds, src, merge, vector_len); break; 9409 case T_SHORT: 9410 evpminsw(dst, mask, nds, src, merge, vector_len); break; 9411 case T_INT: 9412 evpminsd(dst, mask, nds, src, merge, vector_len); break; 9413 case T_LONG: 9414 evpminsq(dst, mask, nds, src, merge, vector_len); break; 9415 default: 9416 fatal("Unexpected type argument %s", type2name(type)); break; 9417 } 9418 } 9419 9420 void 
MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9421 switch(type) { 9422 case T_BYTE: 9423 evpmaxsb(dst, mask, nds, src, merge, vector_len); break; 9424 case T_SHORT: 9425 evpmaxsw(dst, mask, nds, src, merge, vector_len); break; 9426 case T_INT: 9427 evpmaxsd(dst, mask, nds, src, merge, vector_len); break; 9428 case T_LONG: 9429 evpmaxsq(dst, mask, nds, src, merge, vector_len); break; 9430 default: 9431 fatal("Unexpected type argument %s", type2name(type)); break; 9432 } 9433 } 9434 9435 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 9436 switch(type) { 9437 case T_BYTE: 9438 evpminsb(dst, mask, nds, src, merge, vector_len); break; 9439 case T_SHORT: 9440 evpminsw(dst, mask, nds, src, merge, vector_len); break; 9441 case T_INT: 9442 evpminsd(dst, mask, nds, src, merge, vector_len); break; 9443 case T_LONG: 9444 evpminsq(dst, mask, nds, src, merge, vector_len); break; 9445 default: 9446 fatal("Unexpected type argument %s", type2name(type)); break; 9447 } 9448 } 9449 9450 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 9451 switch(type) { 9452 case T_BYTE: 9453 evpmaxsb(dst, mask, nds, src, merge, vector_len); break; 9454 case T_SHORT: 9455 evpmaxsw(dst, mask, nds, src, merge, vector_len); break; 9456 case T_INT: 9457 evpmaxsd(dst, mask, nds, src, merge, vector_len); break; 9458 case T_LONG: 9459 evpmaxsq(dst, mask, nds, src, merge, vector_len); break; 9460 default: 9461 fatal("Unexpected type argument %s", type2name(type)); break; 9462 } 9463 } 9464 9465 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 9466 switch(type) { 9467 case T_INT: 9468 evpxord(dst, mask, nds, src, merge, vector_len); break; 9469 case T_LONG: 9470 evpxorq(dst, mask, nds, src, merge, vector_len); break; 9471 default: 9472 fatal("Unexpected type argument %s", type2name(type)); break; 9473 } 9474 } 9475 9476 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9477 switch(type) { 9478 case T_INT: 9479 evpxord(dst, mask, nds, src, merge, vector_len); break; 9480 case T_LONG: 9481 evpxorq(dst, mask, nds, src, merge, vector_len); break; 9482 default: 9483 fatal("Unexpected type argument %s", type2name(type)); break; 9484 } 9485 } 9486 9487 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 9488 switch(type) { 9489 case T_INT: 9490 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break; 9491 case T_LONG: 9492 evporq(dst, mask, nds, src, merge, vector_len); break; 9493 default: 9494 fatal("Unexpected type argument %s", type2name(type)); break; 9495 } 9496 } 9497 9498 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9499 switch(type) { 9500 case T_INT: 9501 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break; 9502 case T_LONG: 9503 evporq(dst, mask, nds, src, merge, vector_len); break; 9504 default: 9505 fatal("Unexpected type argument %s", type2name(type)); break; 9506 } 9507 } 9508 9509 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister 
src, bool merge, int vector_len) { 9510 switch(type) { 9511 case T_INT: 9512 evpandd(dst, mask, nds, src, merge, vector_len); break; 9513 case T_LONG: 9514 evpandq(dst, mask, nds, src, merge, vector_len); break; 9515 default: 9516 fatal("Unexpected type argument %s", type2name(type)); break; 9517 } 9518 } 9519 9520 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9521 switch(type) { 9522 case T_INT: 9523 evpandd(dst, mask, nds, src, merge, vector_len); break; 9524 case T_LONG: 9525 evpandq(dst, mask, nds, src, merge, vector_len); break; 9526 default: 9527 fatal("Unexpected type argument %s", type2name(type)); break; 9528 } 9529 } 9530 9531 void MacroAssembler::anytrue(Register dst, uint masklen, KRegister src1, KRegister src2) { 9532 masklen = masklen < 8 ? 8 : masklen; 9533 ktest(masklen, src1, src2); 9534 setb(Assembler::notZero, dst); 9535 movzbl(dst, dst); 9536 } 9537 9538 void MacroAssembler::alltrue(Register dst, uint masklen, KRegister src1, KRegister src2, KRegister kscratch) { 9539 if (masklen < 8) { 9540 knotbl(kscratch, src2); 9541 kortestbl(src1, kscratch); 9542 setb(Assembler::carrySet, dst); 9543 movzbl(dst, dst); 9544 } else { 9545 ktest(masklen, src1, src2); 9546 setb(Assembler::carrySet, dst); 9547 movzbl(dst, dst); 9548 } 9549 } 9550 9551 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) { 9552 switch(masklen) { 9553 case 8: 9554 kortestbl(src1, src2); 9555 break; 9556 case 16: 9557 kortestwl(src1, src2); 9558 break; 9559 case 32: 9560 kortestdl(src1, src2); 9561 break; 9562 case 64: 9563 kortestql(src1, src2); 9564 break; 9565 default: 9566 fatal("Unexpected mask length %d", masklen); 9567 break; 9568 } 9569 } 9570 9571 9572 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) { 9573 switch(masklen) { 9574 case 8: 9575 ktestbl(src1, src2); 9576 break; 9577 case 16: 9578 ktestwl(src1, src2); 9579 break; 9580 case 32: 9581 ktestdl(src1, src2); 9582 break; 9583 case 64: 9584 ktestql(src1, src2); 9585 break; 9586 default: 9587 fatal("Unexpected mask length %d", masklen); 9588 break; 9589 } 9590 } 9591 9592 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) { 9593 switch(type) { 9594 case T_INT: 9595 evprold(dst, mask, src, shift, merge, vlen_enc); break; 9596 case T_LONG: 9597 evprolq(dst, mask, src, shift, merge, vlen_enc); break; 9598 default: 9599 fatal("Unexpected type argument %s", type2name(type)); break; 9600 break; 9601 } 9602 } 9603 9604 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) { 9605 switch(type) { 9606 case T_INT: 9607 evprord(dst, mask, src, shift, merge, vlen_enc); break; 9608 case T_LONG: 9609 evprorq(dst, mask, src, shift, merge, vlen_enc); break; 9610 default: 9611 fatal("Unexpected type argument %s", type2name(type)); break; 9612 } 9613 } 9614 9615 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 9616 switch(type) { 9617 case T_INT: 9618 evprolvd(dst, mask, src1, src2, merge, vlen_enc); break; 9619 case T_LONG: 9620 evprolvq(dst, mask, src1, src2, merge, vlen_enc); break; 9621 default: 9622 fatal("Unexpected type argument %s", type2name(type)); break; 9623 } 9624 } 9625 9626 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, 
XMMRegister src2, bool merge, int vlen_enc) { 9627 switch(type) { 9628 case T_INT: 9629 evprorvd(dst, mask, src1, src2, merge, vlen_enc); break; 9630 case T_LONG: 9631 evprorvq(dst, mask, src1, src2, merge, vlen_enc); break; 9632 default: 9633 fatal("Unexpected type argument %s", type2name(type)); break; 9634 } 9635 } 9636 #if COMPILER2_OR_JVMCI 9637 9638 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask, 9639 Register length, Register temp, int vec_enc) { 9640 // Computing mask for predicated vector store. 9641 movptr(temp, -1); 9642 bzhiq(temp, temp, length); 9643 kmov(mask, temp); 9644 evmovdqu(bt, mask, dst, xmm, vec_enc); 9645 } 9646 9647 // Set memory operation for length "less than" 64 bytes. 9648 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp, 9649 XMMRegister xmm, KRegister mask, Register length, 9650 Register temp, bool use64byteVector) { 9651 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 9652 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; 9653 if (!use64byteVector) { 9654 fill32(dst, disp, xmm); 9655 subptr(length, 32 >> shift); 9656 fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp); 9657 } else { 9658 assert(MaxVectorSize == 64, "vector length != 64"); 9659 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit); 9660 } 9661 } 9662 9663 9664 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp, 9665 XMMRegister xmm, KRegister mask, Register length, 9666 Register temp) { 9667 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 9668 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; 9669 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit); 9670 } 9671 9672 9673 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) { 9674 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 9675 vmovdqu(Address(dst, disp), xmm); 9676 } 9677 9678 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) { 9679 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 9680 BasicType type[] = {T_BYTE, T_SHORT, T_INT, T_LONG}; 9681 if (!use64byteVector) { 9682 fill32(dst, disp, xmm); 9683 fill32(dst, disp + 32, xmm); 9684 } else { 9685 evmovdquq(Address(dst, disp), xmm, Assembler::AVX_512bit); 9686 } 9687 } 9688 9689 #ifdef _LP64 9690 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value, 9691 Register count, Register rtmp, XMMRegister xtmp) { 9692 Label L_exit; 9693 Label L_fill_start; 9694 Label L_fill_64_bytes; 9695 Label L_fill_96_bytes; 9696 Label L_fill_128_bytes; 9697 Label L_fill_128_bytes_loop; 9698 Label L_fill_128_loop_header; 9699 Label L_fill_128_bytes_loop_header; 9700 Label L_fill_128_bytes_loop_pre_header; 9701 Label L_fill_zmm_sequence; 9702 9703 int shift = -1; 9704 int avx3threshold = VM_Version::avx3_threshold(); 9705 switch(type) { 9706 case T_BYTE: shift = 0; 9707 break; 9708 case T_SHORT: shift = 1; 9709 break; 9710 case T_INT: shift = 2; 9711 break; 9712 /* Uncomment when LONG fill stubs are supported. 
9713 case T_LONG: shift = 3; 9714 break; 9715 */ 9716 default: 9717 fatal("Unhandled type: %s\n", type2name(type)); 9718 } 9719 9720 if ((avx3threshold != 0) || (MaxVectorSize == 32)) { 9721 9722 if (MaxVectorSize == 64) { 9723 cmpq(count, avx3threshold >> shift); 9724 jcc(Assembler::greater, L_fill_zmm_sequence); 9725 } 9726 9727 evpbroadcast(type, xtmp, value, Assembler::AVX_256bit); 9728 9729 bind(L_fill_start); 9730 9731 cmpq(count, 32 >> shift); 9732 jccb(Assembler::greater, L_fill_64_bytes); 9733 fill32_masked(shift, to, 0, xtmp, k2, count, rtmp); 9734 jmp(L_exit); 9735 9736 bind(L_fill_64_bytes); 9737 cmpq(count, 64 >> shift); 9738 jccb(Assembler::greater, L_fill_96_bytes); 9739 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp); 9740 jmp(L_exit); 9741 9742 bind(L_fill_96_bytes); 9743 cmpq(count, 96 >> shift); 9744 jccb(Assembler::greater, L_fill_128_bytes); 9745 fill64(to, 0, xtmp); 9746 subq(count, 64 >> shift); 9747 fill32_masked(shift, to, 64, xtmp, k2, count, rtmp); 9748 jmp(L_exit); 9749 9750 bind(L_fill_128_bytes); 9751 cmpq(count, 128 >> shift); 9752 jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header); 9753 fill64(to, 0, xtmp); 9754 fill32(to, 64, xtmp); 9755 subq(count, 96 >> shift); 9756 fill32_masked(shift, to, 96, xtmp, k2, count, rtmp); 9757 jmp(L_exit); 9758 9759 bind(L_fill_128_bytes_loop_pre_header); 9760 { 9761 mov(rtmp, to); 9762 andq(rtmp, 31); 9763 jccb(Assembler::zero, L_fill_128_bytes_loop_header); 9764 negq(rtmp); 9765 addq(rtmp, 32); 9766 mov64(r8, -1L); 9767 bzhiq(r8, r8, rtmp); 9768 kmovql(k2, r8); 9769 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, Assembler::AVX_256bit); 9770 addq(to, rtmp); 9771 shrq(rtmp, shift); 9772 subq(count, rtmp); 9773 } 9774 9775 cmpq(count, 128 >> shift); 9776 jcc(Assembler::less, L_fill_start); 9777 9778 bind(L_fill_128_bytes_loop_header); 9779 subq(count, 128 >> shift); 9780 9781 align32(); 9782 bind(L_fill_128_bytes_loop); 9783 fill64(to, 0, xtmp); 9784 fill64(to, 64, xtmp); 9785 addq(to, 128); 9786 subq(count, 128 >> shift); 9787 jccb(Assembler::greaterEqual, L_fill_128_bytes_loop); 9788 9789 addq(count, 128 >> shift); 9790 jcc(Assembler::zero, L_exit); 9791 jmp(L_fill_start); 9792 } 9793 9794 if (MaxVectorSize == 64) { 9795 // Sequence using 64 byte ZMM register. 
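    // This mirrors the 32-byte sequence above at twice the granularity:
    // fills of up to 64/128/192 bytes are handled with one/two/three stores
    // (the last one mask-tailed); larger fills first align the destination
    // to 64 bytes with a masked byte store and then loop 192 bytes per
    // iteration, falling back to the small-count ladder for the remainder.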
9796 Label L_fill_128_bytes_zmm; 9797 Label L_fill_192_bytes_zmm; 9798 Label L_fill_192_bytes_loop_zmm; 9799 Label L_fill_192_bytes_loop_header_zmm; 9800 Label L_fill_192_bytes_loop_pre_header_zmm; 9801 Label L_fill_start_zmm_sequence; 9802 9803 bind(L_fill_zmm_sequence); 9804 evpbroadcast(type, xtmp, value, Assembler::AVX_512bit); 9805 9806 bind(L_fill_start_zmm_sequence); 9807 cmpq(count, 64 >> shift); 9808 jccb(Assembler::greater, L_fill_128_bytes_zmm); 9809 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true); 9810 jmp(L_exit); 9811 9812 bind(L_fill_128_bytes_zmm); 9813 cmpq(count, 128 >> shift); 9814 jccb(Assembler::greater, L_fill_192_bytes_zmm); 9815 fill64(to, 0, xtmp, true); 9816 subq(count, 64 >> shift); 9817 fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true); 9818 jmp(L_exit); 9819 9820 bind(L_fill_192_bytes_zmm); 9821 cmpq(count, 192 >> shift); 9822 jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm); 9823 fill64(to, 0, xtmp, true); 9824 fill64(to, 64, xtmp, true); 9825 subq(count, 128 >> shift); 9826 fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true); 9827 jmp(L_exit); 9828 9829 bind(L_fill_192_bytes_loop_pre_header_zmm); 9830 { 9831 movq(rtmp, to); 9832 andq(rtmp, 63); 9833 jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm); 9834 negq(rtmp); 9835 addq(rtmp, 64); 9836 mov64(r8, -1L); 9837 bzhiq(r8, r8, rtmp); 9838 kmovql(k2, r8); 9839 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, Assembler::AVX_512bit); 9840 addq(to, rtmp); 9841 shrq(rtmp, shift); 9842 subq(count, rtmp); 9843 } 9844 9845 cmpq(count, 192 >> shift); 9846 jcc(Assembler::less, L_fill_start_zmm_sequence); 9847 9848 bind(L_fill_192_bytes_loop_header_zmm); 9849 subq(count, 192 >> shift); 9850 9851 align32(); 9852 bind(L_fill_192_bytes_loop_zmm); 9853 fill64(to, 0, xtmp, true); 9854 fill64(to, 64, xtmp, true); 9855 fill64(to, 128, xtmp, true); 9856 addq(to, 192); 9857 subq(count, 192 >> shift); 9858 jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm); 9859 9860 addq(count, 192 >> shift); 9861 jcc(Assembler::zero, L_exit); 9862 jmp(L_fill_start_zmm_sequence); 9863 } 9864 bind(L_exit); 9865 } 9866 #endif 9867 #endif //COMPILER2_OR_JVMCI 9868 9869 9870 #ifdef _LP64 9871 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) { 9872 Label done; 9873 cvttss2sil(dst, src); 9874 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub 9875 cmpl(dst, 0x80000000); // float_sign_flip 9876 jccb(Assembler::notEqual, done); 9877 subptr(rsp, 8); 9878 movflt(Address(rsp, 0), src); 9879 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup()))); 9880 pop(dst); 9881 bind(done); 9882 } 9883 9884 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) { 9885 Label done; 9886 cvttsd2sil(dst, src); 9887 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub 9888 cmpl(dst, 0x80000000); // float_sign_flip 9889 jccb(Assembler::notEqual, done); 9890 subptr(rsp, 8); 9891 movdbl(Address(rsp, 0), src); 9892 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup()))); 9893 pop(dst); 9894 bind(done); 9895 } 9896 9897 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) { 9898 Label done; 9899 cvttss2siq(dst, src); 9900 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip())); 9901 jccb(Assembler::notEqual, done); 9902 subptr(rsp, 8); 9903 movflt(Address(rsp, 0), src); 9904 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup()))); 
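  // the fixup stub is expected to leave the corrected result in the stack
  // slot holding the argument; it is popped into dst below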
  pop(dst);
  bind(done);
}

void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
  Label done;
  cvttsd2siq(dst, src);
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
  pop(dst);
  bind(done);
}

void MacroAssembler::cache_wb(Address line)
{
  // 64-bit CPUs always support clflush
  assert(VM_Version::supports_clflush(), "clflush should be available");
  bool optimized = VM_Version::supports_clflushopt();
  bool no_evict = VM_Version::supports_clwb();

  // prefer clwb (writeback without evict), otherwise
  // prefer clflushopt (potentially parallel writeback with evict),
  // otherwise fall back on clflush (serial writeback with evict)

  if (optimized) {
    if (no_evict) {
      clwb(line);
    } else {
      clflushopt(line);
    }
  } else {
    // no need for fence when using CLFLUSH
    clflush(line);
  }
}

void MacroAssembler::cache_wbsync(bool is_pre)
{
  assert(VM_Version::supports_clflush(), "clflush should be available");
  bool optimized = VM_Version::supports_clflushopt();
  bool no_evict = VM_Version::supports_clwb();

  // pick the correct implementation

  if (!is_pre && (optimized || no_evict)) {
    // need an sfence for post flush when using clflushopt or clwb,
    // otherwise no need for any synchronization

    sfence();
  }
}

#endif // _LP64

Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
  switch (cond) {
    // Note some conditions are synonyms for others
    case Assembler::zero: return Assembler::notZero;
    case Assembler::notZero: return Assembler::zero;
    case Assembler::less: return Assembler::greaterEqual;
    case Assembler::lessEqual: return Assembler::greater;
    case Assembler::greater: return Assembler::lessEqual;
    case Assembler::greaterEqual: return Assembler::less;
    case Assembler::below: return Assembler::aboveEqual;
    case Assembler::belowEqual: return Assembler::above;
    case Assembler::above: return Assembler::belowEqual;
    case Assembler::aboveEqual: return Assembler::below;
    case Assembler::overflow: return Assembler::noOverflow;
    case Assembler::noOverflow: return Assembler::overflow;
    case Assembler::negative: return Assembler::positive;
    case Assembler::positive: return Assembler::negative;
    case Assembler::parity: return Assembler::noParity;
    case Assembler::noParity: return Assembler::parity;
  }
  ShouldNotReachHere(); return Assembler::overflow;
}

SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  _masm->cmp8(ExternalAddress((address)flag_addr), value);
  _masm->jcc(Assembler::equal, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

// 32-bit Windows has its own fast-path implementation
// of get_thread
#if !defined(WIN32) || defined(_LP64)

// This is simply a call to Thread::current()
void MacroAssembler::get_thread(Register thread) {
  if (thread != rax) {
    push(rax);
  }
  LP64_ONLY(push(rdi);)
  LP64_ONLY(push(rsi);)
  push(rdx);
  push(rcx);
#ifdef _LP64
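  // r8-r11 are caller-saved in both 64-bit ABIs, so they must also be
  // preserved around the call to Thread::current()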
  push(r8);
  push(r9);
  push(r10);
  push(r11);
#endif

  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);

#ifdef _LP64
  pop(r11);
  pop(r10);
  pop(r9);
  pop(r8);
#endif
  pop(rcx);
  pop(rdx);
  LP64_ONLY(pop(rsi);)
  LP64_ONLY(pop(rdi);)
  if (thread != rax) {
    mov(thread, rax);
    pop(rax);
  }
}

#endif // !WIN32 || _LP64
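// SkipIfEqual (defined above) is used as a scoped guard around generated code
// that should only execute when a bool VM flag has the opposite value, e.g.
// (an illustrative sketch of a typical call site, not code from this file):
//
//   {
//     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
//     // ... code emitted here is jumped over at runtime whenever
//     //     DTraceMethodProbes == false ...
//   }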