1 /* 2 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "compiler/compiler_globals.hpp" 29 #include "compiler/disassembler.hpp" 30 #include "crc32c.h" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "gc/shared/collectedHeap.inline.hpp" 34 #include "gc/shared/tlab_globals.hpp" 35 #include "interpreter/bytecodeHistogram.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "jvm.h" 38 #include "memory/resourceArea.hpp" 39 #include "memory/universe.hpp" 40 #include "oops/accessDecorators.hpp" 41 #include "oops/compressedKlass.inline.hpp" 42 #include "oops/compressedOops.inline.hpp" 43 #include "oops/klass.inline.hpp" 44 #include "prims/methodHandles.hpp" 45 #include "runtime/continuation.hpp" 46 #include "runtime/flags/flagSetting.hpp" 47 #include "runtime/interfaceSupport.inline.hpp" 48 #include "runtime/javaThread.hpp" 49 #include "runtime/jniHandles.hpp" 50 #include "runtime/objectMonitor.hpp" 51 #include "runtime/os.hpp" 52 #include "runtime/safepoint.hpp" 53 #include "runtime/safepointMechanism.hpp" 54 #include "runtime/sharedRuntime.hpp" 55 #include "runtime/stubRoutines.hpp" 56 #include "utilities/macros.hpp" 57 58 #ifdef PRODUCT 59 #define BLOCK_COMMENT(str) /* nothing */ 60 #define STOP(error) stop(error) 61 #else 62 #define BLOCK_COMMENT(str) block_comment(str) 63 #define STOP(error) block_comment(error); stop(error) 64 #endif 65 66 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 67 68 #ifdef ASSERT 69 bool AbstractAssembler::pd_check_instruction_mark() { return true; } 70 #endif 71 72 static Assembler::Condition reverse[] = { 73 Assembler::noOverflow /* overflow = 0x0 */ , 74 Assembler::overflow /* noOverflow = 0x1 */ , 75 Assembler::aboveEqual /* carrySet = 0x2, below = 0x2 */ , 76 Assembler::below /* aboveEqual = 0x3, carryClear = 0x3 */ , 77 Assembler::notZero /* zero = 0x4, equal = 0x4 */ , 78 Assembler::zero /* notZero = 0x5, notEqual = 0x5 */ , 79 Assembler::above /* belowEqual = 0x6 */ , 80 Assembler::belowEqual /* above = 0x7 */ , 81 Assembler::positive /* negative = 0x8 */ , 82 Assembler::negative /* positive = 0x9 */ , 83 Assembler::noParity /* parity = 0xa */ , 84 Assembler::parity /* noParity = 0xb */ , 85 Assembler::greaterEqual /* less = 0xc */ , 86 Assembler::less /* greaterEqual = 0xd */ , 87 Assembler::greater /* lessEqual = 0xe */ , 88 Assembler::lessEqual /* 
greater = 0xf, */ 89 90 }; 91 92 93 // Implementation of MacroAssembler 94 95 // First all the versions that have distinct versions depending on 32/64 bit 96 // Unless the difference is trivial (1 line or so). 97 98 #ifndef _LP64 99 100 // 32bit versions 101 102 Address MacroAssembler::as_Address(AddressLiteral adr) { 103 return Address(adr.target(), adr.rspec()); 104 } 105 106 Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) { 107 assert(rscratch == noreg, ""); 108 return Address::make_array(adr); 109 } 110 111 void MacroAssembler::call_VM_leaf_base(address entry_point, 112 int number_of_arguments) { 113 call(RuntimeAddress(entry_point)); 114 increment(rsp, number_of_arguments * wordSize); 115 } 116 117 void MacroAssembler::cmpklass(Address src1, Metadata* obj) { 118 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate()); 119 } 120 121 122 void MacroAssembler::cmpklass(Register src1, Metadata* obj) { 123 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate()); 124 } 125 126 void MacroAssembler::cmpoop(Address src1, jobject obj) { 127 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate()); 128 } 129 130 void MacroAssembler::cmpoop(Register src1, jobject obj, Register rscratch) { 131 assert(rscratch == noreg, "redundant"); 132 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate()); 133 } 134 135 void MacroAssembler::extend_sign(Register hi, Register lo) { 136 // According to Intel Doc. AP-526, "Integer Divide", p.18. 137 if (VM_Version::is_P6() && hi == rdx && lo == rax) { 138 cdql(); 139 } else { 140 movl(hi, lo); 141 sarl(hi, 31); 142 } 143 } 144 145 void MacroAssembler::jC2(Register tmp, Label& L) { 146 // set parity bit if FPU flag C2 is set (via rax) 147 save_rax(tmp); 148 fwait(); fnstsw_ax(); 149 sahf(); 150 restore_rax(tmp); 151 // branch 152 jcc(Assembler::parity, L); 153 } 154 155 void MacroAssembler::jnC2(Register tmp, Label& L) { 156 // set parity bit if FPU flag C2 is set (via rax) 157 save_rax(tmp); 158 fwait(); fnstsw_ax(); 159 sahf(); 160 restore_rax(tmp); 161 // branch 162 jcc(Assembler::noParity, L); 163 } 164 165 // 32bit can do a case table jump in one instruction but we no longer allow the base 166 // to be installed in the Address class 167 void MacroAssembler::jump(ArrayAddress entry, Register rscratch) { 168 assert(rscratch == noreg, "not needed"); 169 jmp(as_Address(entry, noreg)); 170 } 171 172 // Note: y_lo will be destroyed 173 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) { 174 // Long compare for Java (semantics as described in JVM spec.) 
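  // The result is produced in x_hi using the Java lcmp convention:
  //   1 if x > y, 0 if x == y, -1 if x < y.
  // e.g. for x = 0x0000000100000000L and y = 0x00000000ffffffffL the high words
  // already differ (1 > 0), so the code branches to 'high' below and returns 1.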
175 Label high, low, done; 176 177 cmpl(x_hi, y_hi); 178 jcc(Assembler::less, low); 179 jcc(Assembler::greater, high); 180 // x_hi is the return register 181 xorl(x_hi, x_hi); 182 cmpl(x_lo, y_lo); 183 jcc(Assembler::below, low); 184 jcc(Assembler::equal, done); 185 186 bind(high); 187 xorl(x_hi, x_hi); 188 increment(x_hi); 189 jmp(done); 190 191 bind(low); 192 xorl(x_hi, x_hi); 193 decrementl(x_hi); 194 195 bind(done); 196 } 197 198 void MacroAssembler::lea(Register dst, AddressLiteral src) { 199 mov_literal32(dst, (int32_t)src.target(), src.rspec()); 200 } 201 202 void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) { 203 assert(rscratch == noreg, "not needed"); 204 205 // leal(dst, as_Address(adr)); 206 // see note in movl as to why we must use a move 207 mov_literal32(dst, (int32_t)adr.target(), adr.rspec()); 208 } 209 210 void MacroAssembler::leave() { 211 mov(rsp, rbp); 212 pop(rbp); 213 } 214 215 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) { 216 // Multiplication of two Java long values stored on the stack 217 // as illustrated below. Result is in rdx:rax. 218 // 219 // rsp ---> [ ?? ] \ \ 220 // .... | y_rsp_offset | 221 // [ y_lo ] / (in bytes) | x_rsp_offset 222 // [ y_hi ] | (in bytes) 223 // .... | 224 // [ x_lo ] / 225 // [ x_hi ] 226 // .... 227 // 228 // Basic idea: lo(result) = lo(x_lo * y_lo) 229 // hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi) 230 Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset); 231 Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset); 232 Label quick; 233 // load x_hi, y_hi and check if quick 234 // multiplication is possible 235 movl(rbx, x_hi); 236 movl(rcx, y_hi); 237 movl(rax, rbx); 238 orl(rbx, rcx); // rbx, = 0 <=> x_hi = 0 and y_hi = 0 239 jcc(Assembler::zero, quick); // if rbx, = 0 do quick multiply 240 // do full multiplication 241 // 1st step 242 mull(y_lo); // x_hi * y_lo 243 movl(rbx, rax); // save lo(x_hi * y_lo) in rbx, 244 // 2nd step 245 movl(rax, x_lo); 246 mull(rcx); // x_lo * y_hi 247 addl(rbx, rax); // add lo(x_lo * y_hi) to rbx, 248 // 3rd step 249 bind(quick); // note: rbx, = 0 if quick multiply! 250 movl(rax, x_lo); 251 mull(y_lo); // x_lo * y_lo 252 addl(rdx, rbx); // correct hi(x_lo * y_lo) 253 } 254 255 void MacroAssembler::lneg(Register hi, Register lo) { 256 negl(lo); 257 adcl(hi, 0); 258 negl(hi); 259 } 260 261 void MacroAssembler::lshl(Register hi, Register lo) { 262 // Java shift left long support (semantics as described in JVM spec., p.305) 263 // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n)) 264 // shift value is in rcx ! 265 assert(hi != rcx, "must not use rcx"); 266 assert(lo != rcx, "must not use rcx"); 267 const Register s = rcx; // shift count 268 const int n = BitsPerWord; 269 Label L; 270 andl(s, 0x3f); // s := s & 0x3f (s < 0x40) 271 cmpl(s, n); // if (s < n) 272 jcc(Assembler::less, L); // else (s >= n) 273 movl(hi, lo); // x := x << n 274 xorl(lo, lo); 275 // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n! 
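  // e.g. for a shift count of 40: the block above has already moved lo into hi
  // and cleared lo, and shld/shl below then shift by the remaining 40 mod 32 = 8
  // bits, because the hardware masks the count in cl to 5 bits for 32-bit operands.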
276 bind(L); // s (mod n) < n 277 shldl(hi, lo); // x := x << s 278 shll(lo); 279 } 280 281 282 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) { 283 // Java shift right long support (semantics as described in JVM spec., p.306 & p.310) 284 // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n)) 285 assert(hi != rcx, "must not use rcx"); 286 assert(lo != rcx, "must not use rcx"); 287 const Register s = rcx; // shift count 288 const int n = BitsPerWord; 289 Label L; 290 andl(s, 0x3f); // s := s & 0x3f (s < 0x40) 291 cmpl(s, n); // if (s < n) 292 jcc(Assembler::less, L); // else (s >= n) 293 movl(lo, hi); // x := x >> n 294 if (sign_extension) sarl(hi, 31); 295 else xorl(hi, hi); 296 // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n! 297 bind(L); // s (mod n) < n 298 shrdl(lo, hi); // x := x >> s 299 if (sign_extension) sarl(hi); 300 else shrl(hi); 301 } 302 303 void MacroAssembler::movoop(Register dst, jobject obj) { 304 mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate()); 305 } 306 307 void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) { 308 assert(rscratch == noreg, "redundant"); 309 mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate()); 310 } 311 312 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 313 mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate()); 314 } 315 316 void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) { 317 assert(rscratch == noreg, "redundant"); 318 mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate()); 319 } 320 321 void MacroAssembler::movptr(Register dst, AddressLiteral src) { 322 if (src.is_lval()) { 323 mov_literal32(dst, (intptr_t)src.target(), src.rspec()); 324 } else { 325 movl(dst, as_Address(src)); 326 } 327 } 328 329 void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) { 330 assert(rscratch == noreg, "redundant"); 331 movl(as_Address(dst, noreg), src); 332 } 333 334 void MacroAssembler::movptr(Register dst, ArrayAddress src) { 335 movl(dst, as_Address(src, noreg)); 336 } 337 338 void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) { 339 assert(rscratch == noreg, "redundant"); 340 movl(dst, src); 341 } 342 343 void MacroAssembler::pushoop(jobject obj, Register rscratch) { 344 assert(rscratch == noreg, "redundant"); 345 push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate()); 346 } 347 348 void MacroAssembler::pushklass(Metadata* obj, Register rscratch) { 349 assert(rscratch == noreg, "redundant"); 350 push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate()); 351 } 352 353 void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) { 354 assert(rscratch == noreg, "redundant"); 355 if (src.is_lval()) { 356 push_literal32((int32_t)src.target(), src.rspec()); 357 } else { 358 pushl(as_Address(src)); 359 } 360 } 361 362 static void pass_arg0(MacroAssembler* masm, Register arg) { 363 masm->push(arg); 364 } 365 366 static void pass_arg1(MacroAssembler* masm, Register arg) { 367 masm->push(arg); 368 } 369 370 static void pass_arg2(MacroAssembler* masm, Register arg) { 371 masm->push(arg); 372 } 373 374 static void pass_arg3(MacroAssembler* masm, Register arg) { 375 masm->push(arg); 376 } 377 378 #ifndef PRODUCT 379 extern "C" void findpc(intptr_t x); 380 #endif 381 382 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, 
int eip, char* msg) { 383 // In order to get locks to work, we need to fake a in_VM state 384 JavaThread* thread = JavaThread::current(); 385 JavaThreadState saved_state = thread->thread_state(); 386 thread->set_thread_state(_thread_in_vm); 387 if (ShowMessageBoxOnError) { 388 JavaThread* thread = JavaThread::current(); 389 JavaThreadState saved_state = thread->thread_state(); 390 thread->set_thread_state(_thread_in_vm); 391 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { 392 ttyLocker ttyl; 393 BytecodeCounter::print(); 394 } 395 // To see where a verify_oop failed, get $ebx+40/X for this frame. 396 // This is the value of eip which points to where verify_oop will return. 397 if (os::message_box(msg, "Execution stopped, print registers?")) { 398 print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip); 399 BREAKPOINT; 400 } 401 } 402 fatal("DEBUG MESSAGE: %s", msg); 403 } 404 405 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) { 406 ttyLocker ttyl; 407 FlagSetting fs(Debugging, true); 408 tty->print_cr("eip = 0x%08x", eip); 409 #ifndef PRODUCT 410 if ((WizardMode || Verbose) && PrintMiscellaneous) { 411 tty->cr(); 412 findpc(eip); 413 tty->cr(); 414 } 415 #endif 416 #define PRINT_REG(rax) \ 417 { tty->print("%s = ", #rax); os::print_location(tty, rax); } 418 PRINT_REG(rax); 419 PRINT_REG(rbx); 420 PRINT_REG(rcx); 421 PRINT_REG(rdx); 422 PRINT_REG(rdi); 423 PRINT_REG(rsi); 424 PRINT_REG(rbp); 425 PRINT_REG(rsp); 426 #undef PRINT_REG 427 // Print some words near top of staack. 428 int* dump_sp = (int*) rsp; 429 for (int col1 = 0; col1 < 8; col1++) { 430 tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); 431 os::print_location(tty, *dump_sp++); 432 } 433 for (int row = 0; row < 16; row++) { 434 tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); 435 for (int col = 0; col < 8; col++) { 436 tty->print(" 0x%08x", *dump_sp++); 437 } 438 tty->cr(); 439 } 440 // Print some instructions around pc: 441 Disassembler::decode((address)eip-64, (address)eip); 442 tty->print_cr("--------"); 443 Disassembler::decode((address)eip, (address)eip+32); 444 } 445 446 void MacroAssembler::stop(const char* msg) { 447 // push address of message 448 ExternalAddress message((address)msg); 449 pushptr(message.addr(), noreg); 450 { Label L; call(L, relocInfo::none); bind(L); } // push eip 451 pusha(); // push registers 452 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32))); 453 hlt(); 454 } 455 456 void MacroAssembler::warn(const char* msg) { 457 push_CPU_state(); 458 459 // push address of message 460 ExternalAddress message((address)msg); 461 pushptr(message.addr(), noreg); 462 463 call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning))); 464 addl(rsp, wordSize); // discard argument 465 pop_CPU_state(); 466 } 467 468 void MacroAssembler::print_state() { 469 { Label L; call(L, relocInfo::none); bind(L); } // push eip 470 pusha(); // push registers 471 472 push_CPU_state(); 473 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32))); 474 pop_CPU_state(); 475 476 popa(); 477 addl(rsp, wordSize); 478 } 479 480 #else // _LP64 481 482 // 64 bit versions 483 484 Address MacroAssembler::as_Address(AddressLiteral adr) { 485 // amd64 always does this as a pc-rel 486 // we can be absolute or disp based on the instruction type 487 // jmp/call are displacements others are absolute 488 assert(!adr.is_lval(), "must 
be rval"); 489 assert(reachable(adr), "must be"); 490 return Address(checked_cast<int32_t>(adr.target() - pc()), adr.target(), adr.reloc()); 491 492 } 493 494 Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) { 495 AddressLiteral base = adr.base(); 496 lea(rscratch, base); 497 Address index = adr.index(); 498 assert(index._disp == 0, "must not have disp"); // maybe it can? 499 Address array(rscratch, index._index, index._scale, index._disp); 500 return array; 501 } 502 503 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) { 504 Label L, E; 505 506 #ifdef _WIN64 507 // Windows always allocates space for it's register args 508 assert(num_args <= 4, "only register arguments supported"); 509 subq(rsp, frame::arg_reg_save_area_bytes); 510 #endif 511 512 // Align stack if necessary 513 testl(rsp, 15); 514 jcc(Assembler::zero, L); 515 516 subq(rsp, 8); 517 call(RuntimeAddress(entry_point)); 518 addq(rsp, 8); 519 jmp(E); 520 521 bind(L); 522 call(RuntimeAddress(entry_point)); 523 524 bind(E); 525 526 #ifdef _WIN64 527 // restore stack pointer 528 addq(rsp, frame::arg_reg_save_area_bytes); 529 #endif 530 531 } 532 533 void MacroAssembler::cmp64(Register src1, AddressLiteral src2, Register rscratch) { 534 assert(!src2.is_lval(), "should use cmpptr"); 535 assert(rscratch != noreg || always_reachable(src2), "missing"); 536 537 if (reachable(src2)) { 538 cmpq(src1, as_Address(src2)); 539 } else { 540 lea(rscratch, src2); 541 Assembler::cmpq(src1, Address(rscratch, 0)); 542 } 543 } 544 545 int MacroAssembler::corrected_idivq(Register reg) { 546 // Full implementation of Java ldiv and lrem; checks for special 547 // case as described in JVM spec., p.243 & p.271. The function 548 // returns the (pc) offset of the idivl instruction - may be needed 549 // for implicit exceptions. 
550 // 551 // normal case special case 552 // 553 // input : rax: dividend min_long 554 // reg: divisor (may not be eax/edx) -1 555 // 556 // output: rax: quotient (= rax idiv reg) min_long 557 // rdx: remainder (= rax irem reg) 0 558 assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register"); 559 static const int64_t min_long = 0x8000000000000000; 560 Label normal_case, special_case; 561 562 // check for special case 563 cmp64(rax, ExternalAddress((address) &min_long), rdx /*rscratch*/); 564 jcc(Assembler::notEqual, normal_case); 565 xorl(rdx, rdx); // prepare rdx for possible special case (where 566 // remainder = 0) 567 cmpq(reg, -1); 568 jcc(Assembler::equal, special_case); 569 570 // handle normal case 571 bind(normal_case); 572 cdqq(); 573 int idivq_offset = offset(); 574 idivq(reg); 575 576 // normal and special case exit 577 bind(special_case); 578 579 return idivq_offset; 580 } 581 582 void MacroAssembler::decrementq(Register reg, int value) { 583 if (value == min_jint) { subq(reg, value); return; } 584 if (value < 0) { incrementq(reg, -value); return; } 585 if (value == 0) { ; return; } 586 if (value == 1 && UseIncDec) { decq(reg) ; return; } 587 /* else */ { subq(reg, value) ; return; } 588 } 589 590 void MacroAssembler::decrementq(Address dst, int value) { 591 if (value == min_jint) { subq(dst, value); return; } 592 if (value < 0) { incrementq(dst, -value); return; } 593 if (value == 0) { ; return; } 594 if (value == 1 && UseIncDec) { decq(dst) ; return; } 595 /* else */ { subq(dst, value) ; return; } 596 } 597 598 void MacroAssembler::incrementq(AddressLiteral dst, Register rscratch) { 599 assert(rscratch != noreg || always_reachable(dst), "missing"); 600 601 if (reachable(dst)) { 602 incrementq(as_Address(dst)); 603 } else { 604 lea(rscratch, dst); 605 incrementq(Address(rscratch, 0)); 606 } 607 } 608 609 void MacroAssembler::incrementq(Register reg, int value) { 610 if (value == min_jint) { addq(reg, value); return; } 611 if (value < 0) { decrementq(reg, -value); return; } 612 if (value == 0) { ; return; } 613 if (value == 1 && UseIncDec) { incq(reg) ; return; } 614 /* else */ { addq(reg, value) ; return; } 615 } 616 617 void MacroAssembler::incrementq(Address dst, int value) { 618 if (value == min_jint) { addq(dst, value); return; } 619 if (value < 0) { decrementq(dst, -value); return; } 620 if (value == 0) { ; return; } 621 if (value == 1 && UseIncDec) { incq(dst) ; return; } 622 /* else */ { addq(dst, value) ; return; } 623 } 624 625 // 32bit can do a case table jump in one instruction but we no longer allow the base 626 // to be installed in the Address class 627 void MacroAssembler::jump(ArrayAddress entry, Register rscratch) { 628 lea(rscratch, entry.base()); 629 Address dispatch = entry.index(); 630 assert(dispatch._base == noreg, "must be"); 631 dispatch._base = rscratch; 632 jmp(dispatch); 633 } 634 635 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) { 636 ShouldNotReachHere(); // 64bit doesn't use two regs 637 cmpq(x_lo, y_lo); 638 } 639 640 void MacroAssembler::lea(Register dst, AddressLiteral src) { 641 mov_literal64(dst, (intptr_t)src.target(), src.rspec()); 642 } 643 644 void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) { 645 lea(rscratch, adr); 646 movptr(dst, rscratch); 647 } 648 649 void MacroAssembler::leave() { 650 // %%% is this really better? Why not on 32bit too? 
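  // 0xC9 is the one-byte LEAVE instruction; it is equivalent to the
  // mov(rsp, rbp); pop(rbp) sequence used by the 32-bit version of leave() above.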
651 emit_int8((unsigned char)0xC9); // LEAVE 652 } 653 654 void MacroAssembler::lneg(Register hi, Register lo) { 655 ShouldNotReachHere(); // 64bit doesn't use two regs 656 negq(lo); 657 } 658 659 void MacroAssembler::movoop(Register dst, jobject obj) { 660 mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate()); 661 } 662 663 void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) { 664 mov_literal64(rscratch, (intptr_t)obj, oop_Relocation::spec_for_immediate()); 665 movq(dst, rscratch); 666 } 667 668 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 669 mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate()); 670 } 671 672 void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) { 673 mov_literal64(rscratch, (intptr_t)obj, metadata_Relocation::spec_for_immediate()); 674 movq(dst, rscratch); 675 } 676 677 void MacroAssembler::movptr(Register dst, AddressLiteral src) { 678 if (src.is_lval()) { 679 mov_literal64(dst, (intptr_t)src.target(), src.rspec()); 680 } else { 681 if (reachable(src)) { 682 movq(dst, as_Address(src)); 683 } else { 684 lea(dst, src); 685 movq(dst, Address(dst, 0)); 686 } 687 } 688 } 689 690 void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) { 691 movq(as_Address(dst, rscratch), src); 692 } 693 694 void MacroAssembler::movptr(Register dst, ArrayAddress src) { 695 movq(dst, as_Address(src, dst /*rscratch*/)); 696 } 697 698 // src should NEVER be a real pointer. Use AddressLiteral for true pointers 699 void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) { 700 if (is_simm32(src)) { 701 movptr(dst, checked_cast<int32_t>(src)); 702 } else { 703 mov64(rscratch, src); 704 movq(dst, rscratch); 705 } 706 } 707 708 void MacroAssembler::pushoop(jobject obj, Register rscratch) { 709 movoop(rscratch, obj); 710 push(rscratch); 711 } 712 713 void MacroAssembler::pushklass(Metadata* obj, Register rscratch) { 714 mov_metadata(rscratch, obj); 715 push(rscratch); 716 } 717 718 void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) { 719 lea(rscratch, src); 720 if (src.is_lval()) { 721 push(rscratch); 722 } else { 723 pushq(Address(rscratch, 0)); 724 } 725 } 726 727 void MacroAssembler::reset_last_Java_frame(bool clear_fp) { 728 reset_last_Java_frame(r15_thread, clear_fp); 729 } 730 731 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 732 Register last_java_fp, 733 address last_java_pc, 734 Register rscratch) { 735 set_last_Java_frame(r15_thread, last_java_sp, last_java_fp, last_java_pc, rscratch); 736 } 737 738 static void pass_arg0(MacroAssembler* masm, Register arg) { 739 if (c_rarg0 != arg ) { 740 masm->mov(c_rarg0, arg); 741 } 742 } 743 744 static void pass_arg1(MacroAssembler* masm, Register arg) { 745 if (c_rarg1 != arg ) { 746 masm->mov(c_rarg1, arg); 747 } 748 } 749 750 static void pass_arg2(MacroAssembler* masm, Register arg) { 751 if (c_rarg2 != arg ) { 752 masm->mov(c_rarg2, arg); 753 } 754 } 755 756 static void pass_arg3(MacroAssembler* masm, Register arg) { 757 if (c_rarg3 != arg ) { 758 masm->mov(c_rarg3, arg); 759 } 760 } 761 762 void MacroAssembler::stop(const char* msg) { 763 if (ShowMessageBoxOnError) { 764 address rip = pc(); 765 pusha(); // get regs on stack 766 lea(c_rarg1, InternalAddress(rip)); 767 movq(c_rarg2, rsp); // pass pointer to regs array 768 } 769 lea(c_rarg0, ExternalAddress((address) msg)); 770 andq(rsp, -16); // align stack as required by ABI 771 
call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64))); 772 hlt(); 773 } 774 775 void MacroAssembler::warn(const char* msg) { 776 push(rbp); 777 movq(rbp, rsp); 778 andq(rsp, -16); // align stack as required by push_CPU_state and call 779 push_CPU_state(); // keeps alignment at 16 bytes 780 781 lea(c_rarg0, ExternalAddress((address) msg)); 782 call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning))); 783 784 pop_CPU_state(); 785 mov(rsp, rbp); 786 pop(rbp); 787 } 788 789 void MacroAssembler::print_state() { 790 address rip = pc(); 791 pusha(); // get regs on stack 792 push(rbp); 793 movq(rbp, rsp); 794 andq(rsp, -16); // align stack as required by push_CPU_state and call 795 push_CPU_state(); // keeps alignment at 16 bytes 796 797 lea(c_rarg0, InternalAddress(rip)); 798 lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array 799 call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1); 800 801 pop_CPU_state(); 802 mov(rsp, rbp); 803 pop(rbp); 804 popa(); 805 } 806 807 #ifndef PRODUCT 808 extern "C" void findpc(intptr_t x); 809 #endif 810 811 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) { 812 // In order to get locks to work, we need to fake a in_VM state 813 if (ShowMessageBoxOnError) { 814 JavaThread* thread = JavaThread::current(); 815 JavaThreadState saved_state = thread->thread_state(); 816 thread->set_thread_state(_thread_in_vm); 817 #ifndef PRODUCT 818 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { 819 ttyLocker ttyl; 820 BytecodeCounter::print(); 821 } 822 #endif 823 // To see where a verify_oop failed, get $ebx+40/X for this frame. 824 // XXX correct this offset for amd64 825 // This is the value of eip which points to where verify_oop will return. 826 if (os::message_box(msg, "Execution stopped, print registers?")) { 827 print_state64(pc, regs); 828 BREAKPOINT; 829 } 830 } 831 fatal("DEBUG MESSAGE: %s", msg); 832 } 833 834 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) { 835 ttyLocker ttyl; 836 FlagSetting fs(Debugging, true); 837 tty->print_cr("rip = 0x%016lx", (intptr_t)pc); 838 #ifndef PRODUCT 839 tty->cr(); 840 findpc(pc); 841 tty->cr(); 842 #endif 843 #define PRINT_REG(rax, value) \ 844 { tty->print("%s = ", #rax); os::print_location(tty, value); } 845 PRINT_REG(rax, regs[15]); 846 PRINT_REG(rbx, regs[12]); 847 PRINT_REG(rcx, regs[14]); 848 PRINT_REG(rdx, regs[13]); 849 PRINT_REG(rdi, regs[8]); 850 PRINT_REG(rsi, regs[9]); 851 PRINT_REG(rbp, regs[10]); 852 // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp 853 PRINT_REG(rsp, (intptr_t)(®s[16])); 854 PRINT_REG(r8 , regs[7]); 855 PRINT_REG(r9 , regs[6]); 856 PRINT_REG(r10, regs[5]); 857 PRINT_REG(r11, regs[4]); 858 PRINT_REG(r12, regs[3]); 859 PRINT_REG(r13, regs[2]); 860 PRINT_REG(r14, regs[1]); 861 PRINT_REG(r15, regs[0]); 862 #undef PRINT_REG 863 // Print some words near the top of the stack. 
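  // regs[] is the block laid down by the pusha() in stop() / print_state():
  // index 0 holds the last register pushed (r15) and index 15 the first (rax),
  // so the original rsp is the address just past the block, i.e. &regs[16].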
864 int64_t* rsp = ®s[16]; 865 int64_t* dump_sp = rsp; 866 for (int col1 = 0; col1 < 8; col1++) { 867 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); 868 os::print_location(tty, *dump_sp++); 869 } 870 for (int row = 0; row < 25; row++) { 871 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); 872 for (int col = 0; col < 4; col++) { 873 tty->print(" 0x%016lx", (intptr_t)*dump_sp++); 874 } 875 tty->cr(); 876 } 877 // Print some instructions around pc: 878 Disassembler::decode((address)pc-64, (address)pc); 879 tty->print_cr("--------"); 880 Disassembler::decode((address)pc, (address)pc+32); 881 } 882 883 // The java_calling_convention describes stack locations as ideal slots on 884 // a frame with no abi restrictions. Since we must observe abi restrictions 885 // (like the placement of the register window) the slots must be biased by 886 // the following value. 887 static int reg2offset_in(VMReg r) { 888 // Account for saved rbp and return address 889 // This should really be in_preserve_stack_slots 890 return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size; 891 } 892 893 static int reg2offset_out(VMReg r) { 894 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size; 895 } 896 897 // A long move 898 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) { 899 900 // The calling conventions assures us that each VMregpair is either 901 // all really one physical register or adjacent stack slots. 902 903 if (src.is_single_phys_reg() ) { 904 if (dst.is_single_phys_reg()) { 905 if (dst.first() != src.first()) { 906 mov(dst.first()->as_Register(), src.first()->as_Register()); 907 } 908 } else { 909 assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)", 910 src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name()); 911 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register()); 912 } 913 } else if (dst.is_single_phys_reg()) { 914 assert(src.is_single_reg(), "not a stack pair"); 915 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias)); 916 } else { 917 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs"); 918 movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias)); 919 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp); 920 } 921 } 922 923 // A double move 924 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) { 925 926 // The calling conventions assures us that each VMregpair is either 927 // all really one physical register or adjacent stack slots. 
928 929 if (src.is_single_phys_reg() ) { 930 if (dst.is_single_phys_reg()) { 931 // In theory these overlap but the ordering is such that this is likely a nop 932 if ( src.first() != dst.first()) { 933 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister()); 934 } 935 } else { 936 assert(dst.is_single_reg(), "not a stack pair"); 937 movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister()); 938 } 939 } else if (dst.is_single_phys_reg()) { 940 assert(src.is_single_reg(), "not a stack pair"); 941 movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias)); 942 } else { 943 assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs"); 944 movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias)); 945 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp); 946 } 947 } 948 949 950 // A float arg may have to do float reg int reg conversion 951 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) { 952 assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move"); 953 954 // The calling conventions assures us that each VMregpair is either 955 // all really one physical register or adjacent stack slots. 956 957 if (src.first()->is_stack()) { 958 if (dst.first()->is_stack()) { 959 movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias)); 960 movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp); 961 } else { 962 // stack to reg 963 assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters"); 964 movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias)); 965 } 966 } else if (dst.first()->is_stack()) { 967 // reg to stack 968 assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters"); 969 movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister()); 970 } else { 971 // reg to reg 972 // In theory these overlap but the ordering is such that this is likely a nop 973 if ( src.first() != dst.first()) { 974 movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister()); 975 } 976 } 977 } 978 979 // On 64 bit we will store integer like items to the stack as 980 // 64 bits items (x86_32/64 abi) even though java would only store 981 // 32bits for a parameter. On 32bit it will simply be 32 bits 982 // So this routine will do 32->32 on 32bit and 32->64 on 64bit 983 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) { 984 if (src.first()->is_stack()) { 985 if (dst.first()->is_stack()) { 986 // stack to stack 987 movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias)); 988 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp); 989 } else { 990 // stack to reg 991 movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias)); 992 } 993 } else if (dst.first()->is_stack()) { 994 // reg to stack 995 // Do we really have to sign extend??? 996 // __ movslq(src.first()->as_Register(), src.first()->as_Register()); 997 movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register()); 998 } else { 999 // Do we really have to sign extend??? 
1000 // __ movslq(dst.first()->as_Register(), src.first()->as_Register()); 1001 if (dst.first() != src.first()) { 1002 movq(dst.first()->as_Register(), src.first()->as_Register()); 1003 } 1004 } 1005 } 1006 1007 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) { 1008 if (src.first()->is_stack()) { 1009 if (dst.first()->is_stack()) { 1010 // stack to stack 1011 movq(rax, Address(rbp, reg2offset_in(src.first()))); 1012 movq(Address(rsp, reg2offset_out(dst.first())), rax); 1013 } else { 1014 // stack to reg 1015 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()))); 1016 } 1017 } else if (dst.first()->is_stack()) { 1018 // reg to stack 1019 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register()); 1020 } else { 1021 if (dst.first() != src.first()) { 1022 movq(dst.first()->as_Register(), src.first()->as_Register()); 1023 } 1024 } 1025 } 1026 1027 // An oop arg. Must pass a handle not the oop itself 1028 void MacroAssembler::object_move(OopMap* map, 1029 int oop_handle_offset, 1030 int framesize_in_slots, 1031 VMRegPair src, 1032 VMRegPair dst, 1033 bool is_receiver, 1034 int* receiver_offset) { 1035 1036 // must pass a handle. First figure out the location we use as a handle 1037 1038 Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register(); 1039 1040 // See if oop is NULL if it is we need no handle 1041 1042 if (src.first()->is_stack()) { 1043 1044 // Oop is already on the stack as an argument 1045 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots(); 1046 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots)); 1047 if (is_receiver) { 1048 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size; 1049 } 1050 1051 cmpptr(Address(rbp, reg2offset_in(src.first())), NULL_WORD); 1052 lea(rHandle, Address(rbp, reg2offset_in(src.first()))); 1053 // conditionally move a NULL 1054 cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first()))); 1055 } else { 1056 1057 // Oop is in a register we must store it to the space we reserve 1058 // on the stack for oop_handles and pass a handle if oop is non-NULL 1059 1060 const Register rOop = src.first()->as_Register(); 1061 int oop_slot; 1062 if (rOop == j_rarg0) 1063 oop_slot = 0; 1064 else if (rOop == j_rarg1) 1065 oop_slot = 1; 1066 else if (rOop == j_rarg2) 1067 oop_slot = 2; 1068 else if (rOop == j_rarg3) 1069 oop_slot = 3; 1070 else if (rOop == j_rarg4) 1071 oop_slot = 4; 1072 else { 1073 assert(rOop == j_rarg5, "wrong register"); 1074 oop_slot = 5; 1075 } 1076 1077 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset; 1078 int offset = oop_slot*VMRegImpl::stack_slot_size; 1079 1080 map->set_oop(VMRegImpl::stack2reg(oop_slot)); 1081 // Store oop in handle area, may be NULL 1082 movptr(Address(rsp, offset), rOop); 1083 if (is_receiver) { 1084 *receiver_offset = offset; 1085 } 1086 1087 cmpptr(rOop, NULL_WORD); 1088 lea(rHandle, Address(rsp, offset)); 1089 // conditionally move a NULL from the handle area where it was just stored 1090 cmovptr(Assembler::equal, rHandle, Address(rsp, offset)); 1091 } 1092 1093 // If arg is on the stack then place it otherwise it is already in correct reg. 
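  // At this point rHandle is either NULL (when the oop was NULL) or the address
  // of the stack slot holding the oop - i.e. the handle the native code receives.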
1094 if (dst.first()->is_stack()) { 1095 movptr(Address(rsp, reg2offset_out(dst.first())), rHandle); 1096 } 1097 } 1098 1099 #endif // _LP64 1100 1101 // Now versions that are common to 32/64 bit 1102 1103 void MacroAssembler::addptr(Register dst, int32_t imm32) { 1104 LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32)); 1105 } 1106 1107 void MacroAssembler::addptr(Register dst, Register src) { 1108 LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); 1109 } 1110 1111 void MacroAssembler::addptr(Address dst, Register src) { 1112 LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); 1113 } 1114 1115 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src, Register rscratch) { 1116 assert(rscratch != noreg || always_reachable(src), "missing"); 1117 1118 if (reachable(src)) { 1119 Assembler::addsd(dst, as_Address(src)); 1120 } else { 1121 lea(rscratch, src); 1122 Assembler::addsd(dst, Address(rscratch, 0)); 1123 } 1124 } 1125 1126 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src, Register rscratch) { 1127 assert(rscratch != noreg || always_reachable(src), "missing"); 1128 1129 if (reachable(src)) { 1130 addss(dst, as_Address(src)); 1131 } else { 1132 lea(rscratch, src); 1133 addss(dst, Address(rscratch, 0)); 1134 } 1135 } 1136 1137 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src, Register rscratch) { 1138 assert(rscratch != noreg || always_reachable(src), "missing"); 1139 1140 if (reachable(src)) { 1141 Assembler::addpd(dst, as_Address(src)); 1142 } else { 1143 lea(rscratch, src); 1144 Assembler::addpd(dst, Address(rscratch, 0)); 1145 } 1146 } 1147 1148 // See 8273459. Function for ensuring 64-byte alignment, intended for stubs only. 1149 // Stub code is generated once and never copied. 1150 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes. 1151 void MacroAssembler::align64() { 1152 align(64, (unsigned long long) pc()); 1153 } 1154 1155 void MacroAssembler::align32() { 1156 align(32, (unsigned long long) pc()); 1157 } 1158 1159 void MacroAssembler::align(int modulus) { 1160 // 8273459: Ensure alignment is possible with current segment alignment 1161 assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment"); 1162 align(modulus, offset()); 1163 } 1164 1165 void MacroAssembler::align(int modulus, int target) { 1166 if (target % modulus != 0) { 1167 nop(modulus - (target % modulus)); 1168 } 1169 } 1170 1171 void MacroAssembler::push_f(XMMRegister r) { 1172 subptr(rsp, wordSize); 1173 movflt(Address(rsp, 0), r); 1174 } 1175 1176 void MacroAssembler::pop_f(XMMRegister r) { 1177 movflt(r, Address(rsp, 0)); 1178 addptr(rsp, wordSize); 1179 } 1180 1181 void MacroAssembler::push_d(XMMRegister r) { 1182 subptr(rsp, 2 * wordSize); 1183 movdbl(Address(rsp, 0), r); 1184 } 1185 1186 void MacroAssembler::pop_d(XMMRegister r) { 1187 movdbl(r, Address(rsp, 0)); 1188 addptr(rsp, 2 * Interpreter::stackElementSize); 1189 } 1190 1191 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register rscratch) { 1192 // Used in sign-masking with aligned address. 
1193 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 1194 assert(rscratch != noreg || always_reachable(src), "missing"); 1195 1196 if (reachable(src)) { 1197 Assembler::andpd(dst, as_Address(src)); 1198 } else { 1199 lea(rscratch, src); 1200 Assembler::andpd(dst, Address(rscratch, 0)); 1201 } 1202 } 1203 1204 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register rscratch) { 1205 // Used in sign-masking with aligned address. 1206 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 1207 assert(rscratch != noreg || always_reachable(src), "missing"); 1208 1209 if (reachable(src)) { 1210 Assembler::andps(dst, as_Address(src)); 1211 } else { 1212 lea(rscratch, src); 1213 Assembler::andps(dst, Address(rscratch, 0)); 1214 } 1215 } 1216 1217 void MacroAssembler::andptr(Register dst, int32_t imm32) { 1218 LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32)); 1219 } 1220 1221 #ifdef _LP64 1222 void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) { 1223 assert(rscratch != noreg || always_reachable(src), "missing"); 1224 1225 if (reachable(src)) { 1226 andq(dst, as_Address(src)); 1227 } else { 1228 lea(rscratch, src); 1229 andq(dst, Address(rscratch, 0)); 1230 } 1231 } 1232 #endif 1233 1234 void MacroAssembler::atomic_incl(Address counter_addr) { 1235 lock(); 1236 incrementl(counter_addr); 1237 } 1238 1239 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register rscratch) { 1240 assert(rscratch != noreg || always_reachable(counter_addr), "missing"); 1241 1242 if (reachable(counter_addr)) { 1243 atomic_incl(as_Address(counter_addr)); 1244 } else { 1245 lea(rscratch, counter_addr); 1246 atomic_incl(Address(rscratch, 0)); 1247 } 1248 } 1249 1250 #ifdef _LP64 1251 void MacroAssembler::atomic_incq(Address counter_addr) { 1252 lock(); 1253 incrementq(counter_addr); 1254 } 1255 1256 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register rscratch) { 1257 assert(rscratch != noreg || always_reachable(counter_addr), "missing"); 1258 1259 if (reachable(counter_addr)) { 1260 atomic_incq(as_Address(counter_addr)); 1261 } else { 1262 lea(rscratch, counter_addr); 1263 atomic_incq(Address(rscratch, 0)); 1264 } 1265 } 1266 #endif 1267 1268 // Writes to stack successive pages until offset reached to check for 1269 // stack overflow + shadow pages. This clobbers tmp. 1270 void MacroAssembler::bang_stack_size(Register size, Register tmp) { 1271 movptr(tmp, rsp); 1272 // Bang stack for total size given plus shadow page size. 1273 // Bang one page at a time because large size can bang beyond yellow and 1274 // red zones. 1275 Label loop; 1276 bind(loop); 1277 movl(Address(tmp, (-(int)os::vm_page_size())), size ); 1278 subptr(tmp, (int)os::vm_page_size()); 1279 subl(size, (int)os::vm_page_size()); 1280 jcc(Assembler::greater, loop); 1281 1282 // Bang down shadow pages too. 1283 // At this point, (tmp-0) is the last address touched, so don't 1284 // touch it again. (It was touched as (tmp-pagesize) but then tmp 1285 // was post-decremented.) Skip this address by starting at i=1, and 1286 // touch a few more pages below. N.B. It is important to touch all 1287 // the way down including all pages in the shadow zone. 1288 for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()); i++) { 1289 // this could be any sized move but this is can be a debugging crumb 1290 // so the bigger the better. 
1291 movptr(Address(tmp, (-i*(int)os::vm_page_size())), size ); 1292 } 1293 } 1294 1295 void MacroAssembler::reserved_stack_check() { 1296 // testing if reserved zone needs to be enabled 1297 Label no_reserved_zone_enabling; 1298 Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread); 1299 NOT_LP64(get_thread(rsi);) 1300 1301 cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset())); 1302 jcc(Assembler::below, no_reserved_zone_enabling); 1303 1304 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread); 1305 jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); 1306 should_not_reach_here(); 1307 1308 bind(no_reserved_zone_enabling); 1309 } 1310 1311 void MacroAssembler::c2bool(Register x) { 1312 // implements x == 0 ? 0 : 1 1313 // note: must only look at least-significant byte of x 1314 // since C-style booleans are stored in one byte 1315 // only! (was bug) 1316 andl(x, 0xFF); 1317 setb(Assembler::notZero, x); 1318 } 1319 1320 // Wouldn't need if AddressLiteral version had new name 1321 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) { 1322 Assembler::call(L, rtype); 1323 } 1324 1325 void MacroAssembler::call(Register entry) { 1326 Assembler::call(entry); 1327 } 1328 1329 void MacroAssembler::call(AddressLiteral entry, Register rscratch) { 1330 assert(rscratch != noreg || always_reachable(entry), "missing"); 1331 1332 if (reachable(entry)) { 1333 Assembler::call_literal(entry.target(), entry.rspec()); 1334 } else { 1335 lea(rscratch, entry); 1336 Assembler::call(rscratch); 1337 } 1338 } 1339 1340 void MacroAssembler::ic_call(address entry, jint method_index) { 1341 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index); 1342 movptr(rax, (intptr_t)Universe::non_oop_word()); 1343 call(AddressLiteral(entry, rh)); 1344 } 1345 1346 void MacroAssembler::emit_static_call_stub() { 1347 // Static stub relocation also tags the Method* in the code-stream. 1348 mov_metadata(rbx, (Metadata*) NULL); // Method is zapped till fixup time. 1349 // This is recognized as unresolved by relocs/nativeinst/ic code. 
1350 jump(RuntimeAddress(pc())); 1351 } 1352 1353 // Implementation of call_VM versions 1354 1355 void MacroAssembler::call_VM(Register oop_result, 1356 address entry_point, 1357 bool check_exceptions) { 1358 Label C, E; 1359 call(C, relocInfo::none); 1360 jmp(E); 1361 1362 bind(C); 1363 call_VM_helper(oop_result, entry_point, 0, check_exceptions); 1364 ret(0); 1365 1366 bind(E); 1367 } 1368 1369 void MacroAssembler::call_VM(Register oop_result, 1370 address entry_point, 1371 Register arg_1, 1372 bool check_exceptions) { 1373 Label C, E; 1374 call(C, relocInfo::none); 1375 jmp(E); 1376 1377 bind(C); 1378 pass_arg1(this, arg_1); 1379 call_VM_helper(oop_result, entry_point, 1, check_exceptions); 1380 ret(0); 1381 1382 bind(E); 1383 } 1384 1385 void MacroAssembler::call_VM(Register oop_result, 1386 address entry_point, 1387 Register arg_1, 1388 Register arg_2, 1389 bool check_exceptions) { 1390 Label C, E; 1391 call(C, relocInfo::none); 1392 jmp(E); 1393 1394 bind(C); 1395 1396 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1397 1398 pass_arg2(this, arg_2); 1399 pass_arg1(this, arg_1); 1400 call_VM_helper(oop_result, entry_point, 2, check_exceptions); 1401 ret(0); 1402 1403 bind(E); 1404 } 1405 1406 void MacroAssembler::call_VM(Register oop_result, 1407 address entry_point, 1408 Register arg_1, 1409 Register arg_2, 1410 Register arg_3, 1411 bool check_exceptions) { 1412 Label C, E; 1413 call(C, relocInfo::none); 1414 jmp(E); 1415 1416 bind(C); 1417 1418 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg")); 1419 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg")); 1420 pass_arg3(this, arg_3); 1421 1422 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1423 pass_arg2(this, arg_2); 1424 1425 pass_arg1(this, arg_1); 1426 call_VM_helper(oop_result, entry_point, 3, check_exceptions); 1427 ret(0); 1428 1429 bind(E); 1430 } 1431 1432 void MacroAssembler::call_VM(Register oop_result, 1433 Register last_java_sp, 1434 address entry_point, 1435 int number_of_arguments, 1436 bool check_exceptions) { 1437 Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg); 1438 call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions); 1439 } 1440 1441 void MacroAssembler::call_VM(Register oop_result, 1442 Register last_java_sp, 1443 address entry_point, 1444 Register arg_1, 1445 bool check_exceptions) { 1446 pass_arg1(this, arg_1); 1447 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); 1448 } 1449 1450 void MacroAssembler::call_VM(Register oop_result, 1451 Register last_java_sp, 1452 address entry_point, 1453 Register arg_1, 1454 Register arg_2, 1455 bool check_exceptions) { 1456 1457 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1458 pass_arg2(this, arg_2); 1459 pass_arg1(this, arg_1); 1460 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); 1461 } 1462 1463 void MacroAssembler::call_VM(Register oop_result, 1464 Register last_java_sp, 1465 address entry_point, 1466 Register arg_1, 1467 Register arg_2, 1468 Register arg_3, 1469 bool check_exceptions) { 1470 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg")); 1471 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg")); 1472 pass_arg3(this, arg_3); 1473 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1474 pass_arg2(this, arg_2); 1475 pass_arg1(this, arg_1); 1476 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); 1477 } 1478 1479 void MacroAssembler::super_call_VM(Register oop_result, 1480 Register last_java_sp, 1481 address entry_point, 1482 int number_of_arguments, 
1483 bool check_exceptions) { 1484 Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg); 1485 MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions); 1486 } 1487 1488 void MacroAssembler::super_call_VM(Register oop_result, 1489 Register last_java_sp, 1490 address entry_point, 1491 Register arg_1, 1492 bool check_exceptions) { 1493 pass_arg1(this, arg_1); 1494 super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); 1495 } 1496 1497 void MacroAssembler::super_call_VM(Register oop_result, 1498 Register last_java_sp, 1499 address entry_point, 1500 Register arg_1, 1501 Register arg_2, 1502 bool check_exceptions) { 1503 1504 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1505 pass_arg2(this, arg_2); 1506 pass_arg1(this, arg_1); 1507 super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); 1508 } 1509 1510 void MacroAssembler::super_call_VM(Register oop_result, 1511 Register last_java_sp, 1512 address entry_point, 1513 Register arg_1, 1514 Register arg_2, 1515 Register arg_3, 1516 bool check_exceptions) { 1517 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg")); 1518 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg")); 1519 pass_arg3(this, arg_3); 1520 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1521 pass_arg2(this, arg_2); 1522 pass_arg1(this, arg_1); 1523 super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); 1524 } 1525 1526 void MacroAssembler::call_VM_base(Register oop_result, 1527 Register java_thread, 1528 Register last_java_sp, 1529 address entry_point, 1530 int number_of_arguments, 1531 bool check_exceptions) { 1532 // determine java_thread register 1533 if (!java_thread->is_valid()) { 1534 #ifdef _LP64 1535 java_thread = r15_thread; 1536 #else 1537 java_thread = rdi; 1538 get_thread(java_thread); 1539 #endif // LP64 1540 } 1541 // determine last_java_sp register 1542 if (!last_java_sp->is_valid()) { 1543 last_java_sp = rsp; 1544 } 1545 // debugging support 1546 assert(number_of_arguments >= 0 , "cannot have negative number of arguments"); 1547 LP64_ONLY(assert(java_thread == r15_thread, "unexpected register")); 1548 #ifdef ASSERT 1549 // TraceBytecodes does not use r12 but saves it over the call, so don't verify 1550 // r12 is the heapbase. 1551 LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");) 1552 #endif // ASSERT 1553 1554 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result"); 1555 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp"); 1556 1557 // push java thread (becomes first argument of C function) 1558 1559 NOT_LP64(push(java_thread); number_of_arguments++); 1560 LP64_ONLY(mov(c_rarg0, r15_thread)); 1561 1562 // set last Java frame before call 1563 assert(last_java_sp != rbp, "can't use ebp/rbp"); 1564 1565 // Only interpreter should have to set fp 1566 set_last_Java_frame(java_thread, last_java_sp, rbp, NULL, rscratch1); 1567 1568 // do the call, remove parameters 1569 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments); 1570 1571 // restore the thread (cannot use the pushed argument since arguments 1572 // may be overwritten by C code generated by an optimizing compiler); 1573 // however can use the register value directly if it is callee saved. 
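  // On 64-bit the thread register is r15, which is callee saved, so the branch
  // below is statically true; on 32-bit only rdi/rsi qualify and any other
  // register has to be reloaded with get_thread().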
  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
    // rdi & rsi (also r15) are callee saved -> nothing to do
#ifdef ASSERT
    guarantee(java_thread != rax, "change this code");
    push(rax);
    { Label L;
      get_thread(rax);
      cmpptr(java_thread, rax);
      jcc(Assembler::equal, L);
      STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
      bind(L);
    }
    pop(rax);
#endif
  } else {
    get_thread(java_thread);
  }
  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(java_thread, true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(java_thread, Thread::pending_exception_offset()), NULL_WORD);
#ifndef _LP64
    jump_cc(Assembler::notEqual,
            RuntimeAddress(StubRoutines::forward_exception_entry()));
#else
    // This used to jump to forward_exception conditionally, but after relocation
    // the conditional branch might not reach, so we jump around an unconditional
    // jump that can always reach the target.
    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
#endif // LP64
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {

  // Calculating the value for last_Java_sp is somewhat subtle. call_VM does an
  // intermediate call which places a return address on the stack just under the
  // stack pointer as the user finished with it. This allows us to retrieve
  // last_Java_pc from last_Java_sp[-1]. On 32bit we then have to push additional
  // args on the stack to accomplish the actual requested call. On 64bit call_VM
  // can only use register args, so the only extra space is the return address
  // that call_VM created. This hopefully explains the calculations here.

#ifdef _LP64
  // We've pushed one address, correct last_Java_sp
  lea(rax, Address(rsp, wordSize));
#else
  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
#endif // LP64

  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);

}

// Use this method when the MacroAssembler version of call_VM_leaf_base() should be called from the Interpreter.
1647 void MacroAssembler::call_VM_leaf0(address entry_point) { 1648 MacroAssembler::call_VM_leaf_base(entry_point, 0); 1649 } 1650 1651 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { 1652 call_VM_leaf_base(entry_point, number_of_arguments); 1653 } 1654 1655 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { 1656 pass_arg0(this, arg_0); 1657 call_VM_leaf(entry_point, 1); 1658 } 1659 1660 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1661 1662 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg")); 1663 pass_arg1(this, arg_1); 1664 pass_arg0(this, arg_0); 1665 call_VM_leaf(entry_point, 2); 1666 } 1667 1668 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1669 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg")); 1670 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1671 pass_arg2(this, arg_2); 1672 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg")); 1673 pass_arg1(this, arg_1); 1674 pass_arg0(this, arg_0); 1675 call_VM_leaf(entry_point, 3); 1676 } 1677 1678 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1679 LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg")); 1680 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg")); 1681 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg")); 1682 pass_arg3(this, arg_3); 1683 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg")); 1684 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1685 pass_arg2(this, arg_2); 1686 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg")); 1687 pass_arg1(this, arg_1); 1688 pass_arg0(this, arg_0); 1689 call_VM_leaf(entry_point, 3); 1690 } 1691 1692 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { 1693 pass_arg0(this, arg_0); 1694 MacroAssembler::call_VM_leaf_base(entry_point, 1); 1695 } 1696 1697 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1698 1699 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg")); 1700 pass_arg1(this, arg_1); 1701 pass_arg0(this, arg_0); 1702 MacroAssembler::call_VM_leaf_base(entry_point, 2); 1703 } 1704 1705 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1706 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg")); 1707 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1708 pass_arg2(this, arg_2); 1709 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg")); 1710 pass_arg1(this, arg_1); 1711 pass_arg0(this, arg_0); 1712 MacroAssembler::call_VM_leaf_base(entry_point, 3); 1713 } 1714 1715 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1716 LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg")); 1717 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg")); 1718 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg")); 1719 pass_arg3(this, arg_3); 1720 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg")); 1721 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 1722 pass_arg2(this, arg_2); 1723 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg")); 1724 pass_arg1(this, arg_1); 1725 pass_arg0(this, arg_0); 1726 MacroAssembler::call_VM_leaf_base(entry_point, 4); 1727 } 1728 1729 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { 1730 movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); 1731 movptr(Address(java_thread, JavaThread::vm_result_offset()), 
NULL_WORD); 1732 verify_oop_msg(oop_result, "broken oop in call_VM_base"); 1733 } 1734 1735 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { 1736 movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 1737 movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD); 1738 } 1739 1740 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { 1741 } 1742 1743 void MacroAssembler::check_and_handle_popframe(Register java_thread) { 1744 } 1745 1746 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm, Register rscratch) { 1747 assert(rscratch != noreg || always_reachable(src1), "missing"); 1748 1749 if (reachable(src1)) { 1750 cmpl(as_Address(src1), imm); 1751 } else { 1752 lea(rscratch, src1); 1753 cmpl(Address(rscratch, 0), imm); 1754 } 1755 } 1756 1757 void MacroAssembler::cmp32(Register src1, AddressLiteral src2, Register rscratch) { 1758 assert(!src2.is_lval(), "use cmpptr"); 1759 assert(rscratch != noreg || always_reachable(src2), "missing"); 1760 1761 if (reachable(src2)) { 1762 cmpl(src1, as_Address(src2)); 1763 } else { 1764 lea(rscratch, src2); 1765 cmpl(src1, Address(rscratch, 0)); 1766 } 1767 } 1768 1769 void MacroAssembler::cmp32(Register src1, int32_t imm) { 1770 Assembler::cmpl(src1, imm); 1771 } 1772 1773 void MacroAssembler::cmp32(Register src1, Address src2) { 1774 Assembler::cmpl(src1, src2); 1775 } 1776 1777 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) { 1778 ucomisd(opr1, opr2); 1779 1780 Label L; 1781 if (unordered_is_less) { 1782 movl(dst, -1); 1783 jcc(Assembler::parity, L); 1784 jcc(Assembler::below , L); 1785 movl(dst, 0); 1786 jcc(Assembler::equal , L); 1787 increment(dst); 1788 } else { // unordered is greater 1789 movl(dst, 1); 1790 jcc(Assembler::parity, L); 1791 jcc(Assembler::above , L); 1792 movl(dst, 0); 1793 jcc(Assembler::equal , L); 1794 decrementl(dst); 1795 } 1796 bind(L); 1797 } 1798 1799 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) { 1800 ucomiss(opr1, opr2); 1801 1802 Label L; 1803 if (unordered_is_less) { 1804 movl(dst, -1); 1805 jcc(Assembler::parity, L); 1806 jcc(Assembler::below , L); 1807 movl(dst, 0); 1808 jcc(Assembler::equal , L); 1809 increment(dst); 1810 } else { // unordered is greater 1811 movl(dst, 1); 1812 jcc(Assembler::parity, L); 1813 jcc(Assembler::above , L); 1814 movl(dst, 0); 1815 jcc(Assembler::equal , L); 1816 decrementl(dst); 1817 } 1818 bind(L); 1819 } 1820 1821 1822 void MacroAssembler::cmp8(AddressLiteral src1, int imm, Register rscratch) { 1823 assert(rscratch != noreg || always_reachable(src1), "missing"); 1824 1825 if (reachable(src1)) { 1826 cmpb(as_Address(src1), imm); 1827 } else { 1828 lea(rscratch, src1); 1829 cmpb(Address(rscratch, 0), imm); 1830 } 1831 } 1832 1833 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2, Register rscratch) { 1834 #ifdef _LP64 1835 assert(rscratch != noreg || always_reachable(src2), "missing"); 1836 1837 if (src2.is_lval()) { 1838 movptr(rscratch, src2); 1839 Assembler::cmpq(src1, rscratch); 1840 } else if (reachable(src2)) { 1841 cmpq(src1, as_Address(src2)); 1842 } else { 1843 lea(rscratch, src2); 1844 Assembler::cmpq(src1, Address(rscratch, 0)); 1845 } 1846 #else 1847 assert(rscratch == noreg, "not needed"); 1848 if (src2.is_lval()) { 1849 cmp_literal32(src1, (int32_t)src2.target(), src2.rspec()); 1850 } else { 1851 cmpl(src1, as_Address(src2)); 1852 } 1853 
#endif // _LP64 1854 } 1855 1856 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2, Register rscratch) { 1857 assert(src2.is_lval(), "not a mem-mem compare"); 1858 #ifdef _LP64 1859 // moves src2's literal address 1860 movptr(rscratch, src2); 1861 Assembler::cmpq(src1, rscratch); 1862 #else 1863 assert(rscratch == noreg, "not needed"); 1864 cmp_literal32(src1, (int32_t)src2.target(), src2.rspec()); 1865 #endif // _LP64 1866 } 1867 1868 void MacroAssembler::cmpoop(Register src1, Register src2) { 1869 cmpptr(src1, src2); 1870 } 1871 1872 void MacroAssembler::cmpoop(Register src1, Address src2) { 1873 cmpptr(src1, src2); 1874 } 1875 1876 #ifdef _LP64 1877 void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) { 1878 movoop(rscratch, src2); 1879 cmpptr(src1, rscratch); 1880 } 1881 #endif 1882 1883 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) { 1884 assert(rscratch != noreg || always_reachable(adr), "missing"); 1885 1886 if (reachable(adr)) { 1887 lock(); 1888 cmpxchgptr(reg, as_Address(adr)); 1889 } else { 1890 lea(rscratch, adr); 1891 lock(); 1892 cmpxchgptr(reg, Address(rscratch, 0)); 1893 } 1894 } 1895 1896 void MacroAssembler::cmpxchgptr(Register reg, Address adr) { 1897 LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr)); 1898 } 1899 1900 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src, Register rscratch) { 1901 assert(rscratch != noreg || always_reachable(src), "missing"); 1902 1903 if (reachable(src)) { 1904 Assembler::comisd(dst, as_Address(src)); 1905 } else { 1906 lea(rscratch, src); 1907 Assembler::comisd(dst, Address(rscratch, 0)); 1908 } 1909 } 1910 1911 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src, Register rscratch) { 1912 assert(rscratch != noreg || always_reachable(src), "missing"); 1913 1914 if (reachable(src)) { 1915 Assembler::comiss(dst, as_Address(src)); 1916 } else { 1917 lea(rscratch, src); 1918 Assembler::comiss(dst, Address(rscratch, 0)); 1919 } 1920 } 1921 1922 1923 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch) { 1924 assert(rscratch != noreg || always_reachable(counter_addr), "missing"); 1925 1926 Condition negated_cond = negate_condition(cond); 1927 Label L; 1928 jcc(negated_cond, L); 1929 pushf(); // Preserve flags 1930 atomic_incl(counter_addr, rscratch); 1931 popf(); 1932 bind(L); 1933 } 1934 1935 int MacroAssembler::corrected_idivl(Register reg) { 1936 // Full implementation of Java idiv and irem; checks for 1937 // special case as described in JVM spec., p.243 & p.271. 1938 // The function returns the (pc) offset of the idivl 1939 // instruction - may be needed for implicit exceptions. 
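// (Background: idivl of min_int by -1 overflows the 32-bit quotient and would
// raise a hardware #DE fault; the JVM spec instead requires the result
// quotient = min_int, remainder = 0, hence the special case handled below.)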
1940 // 1941 // normal case special case 1942 // 1943 // input : rax,: dividend min_int 1944 // reg: divisor (may not be rax,/rdx) -1 1945 // 1946 // output: rax,: quotient (= rax, idiv reg) min_int 1947 // rdx: remainder (= rax, irem reg) 0 1948 assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register"); 1949 const int min_int = 0x80000000; 1950 Label normal_case, special_case; 1951 1952 // check for special case 1953 cmpl(rax, min_int); 1954 jcc(Assembler::notEqual, normal_case); 1955 xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0) 1956 cmpl(reg, -1); 1957 jcc(Assembler::equal, special_case); 1958 1959 // handle normal case 1960 bind(normal_case); 1961 cdql(); 1962 int idivl_offset = offset(); 1963 idivl(reg); 1964 1965 // normal and special case exit 1966 bind(special_case); 1967 1968 return idivl_offset; 1969 } 1970 1971 1972 1973 void MacroAssembler::decrementl(Register reg, int value) { 1974 if (value == min_jint) {subl(reg, value) ; return; } 1975 if (value < 0) { incrementl(reg, -value); return; } 1976 if (value == 0) { ; return; } 1977 if (value == 1 && UseIncDec) { decl(reg) ; return; } 1978 /* else */ { subl(reg, value) ; return; } 1979 } 1980 1981 void MacroAssembler::decrementl(Address dst, int value) { 1982 if (value == min_jint) {subl(dst, value) ; return; } 1983 if (value < 0) { incrementl(dst, -value); return; } 1984 if (value == 0) { ; return; } 1985 if (value == 1 && UseIncDec) { decl(dst) ; return; } 1986 /* else */ { subl(dst, value) ; return; } 1987 } 1988 1989 void MacroAssembler::division_with_shift (Register reg, int shift_value) { 1990 assert(shift_value > 0, "illegal shift value"); 1991 Label _is_positive; 1992 testl (reg, reg); 1993 jcc (Assembler::positive, _is_positive); 1994 int offset = (1 << shift_value) - 1 ; 1995 1996 if (offset == 1) { 1997 incrementl(reg); 1998 } else { 1999 addl(reg, offset); 2000 } 2001 2002 bind (_is_positive); 2003 sarl(reg, shift_value); 2004 } 2005 2006 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src, Register rscratch) { 2007 assert(rscratch != noreg || always_reachable(src), "missing"); 2008 2009 if (reachable(src)) { 2010 Assembler::divsd(dst, as_Address(src)); 2011 } else { 2012 lea(rscratch, src); 2013 Assembler::divsd(dst, Address(rscratch, 0)); 2014 } 2015 } 2016 2017 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src, Register rscratch) { 2018 assert(rscratch != noreg || always_reachable(src), "missing"); 2019 2020 if (reachable(src)) { 2021 Assembler::divss(dst, as_Address(src)); 2022 } else { 2023 lea(rscratch, src); 2024 Assembler::divss(dst, Address(rscratch, 0)); 2025 } 2026 } 2027 2028 void MacroAssembler::enter() { 2029 push(rbp); 2030 mov(rbp, rsp); 2031 } 2032 2033 void MacroAssembler::post_call_nop() { 2034 if (!Continuations::enabled()) { 2035 return; 2036 } 2037 InstructionMark im(this); 2038 relocate(post_call_nop_Relocation::spec()); 2039 InlineSkippedInstructionsCounter skipCounter(this); 2040 emit_int8((int8_t)0x0f); 2041 emit_int8((int8_t)0x1f); 2042 emit_int8((int8_t)0x84); 2043 emit_int8((int8_t)0x00); 2044 emit_int32(0x00); 2045 } 2046 2047 // A 5 byte nop that is safe for patching (see patch_verified_entry) 2048 void MacroAssembler::fat_nop() { 2049 if (UseAddressNop) { 2050 addr_nop_5(); 2051 } else { 2052 emit_int8((int8_t)0x26); // es: 2053 emit_int8((int8_t)0x2e); // cs: 2054 emit_int8((int8_t)0x64); // fs: 2055 emit_int8((int8_t)0x65); // gs: 2056 emit_int8((int8_t)0x90); 2057 } 2058 } 2059 2060 #ifndef _LP64 2061 void 
MacroAssembler::fcmp(Register tmp) { 2062 fcmp(tmp, 1, true, true); 2063 } 2064 2065 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) { 2066 assert(!pop_right || pop_left, "usage error"); 2067 if (VM_Version::supports_cmov()) { 2068 assert(tmp == noreg, "unneeded temp"); 2069 if (pop_left) { 2070 fucomip(index); 2071 } else { 2072 fucomi(index); 2073 } 2074 if (pop_right) { 2075 fpop(); 2076 } 2077 } else { 2078 assert(tmp != noreg, "need temp"); 2079 if (pop_left) { 2080 if (pop_right) { 2081 fcompp(); 2082 } else { 2083 fcomp(index); 2084 } 2085 } else { 2086 fcom(index); 2087 } 2088 // convert FPU condition into eflags condition via rax, 2089 save_rax(tmp); 2090 fwait(); fnstsw_ax(); 2091 sahf(); 2092 restore_rax(tmp); 2093 } 2094 // condition codes set as follows: 2095 // 2096 // CF (corresponds to C0) if x < y 2097 // PF (corresponds to C2) if unordered 2098 // ZF (corresponds to C3) if x = y 2099 } 2100 2101 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) { 2102 fcmp2int(dst, unordered_is_less, 1, true, true); 2103 } 2104 2105 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) { 2106 fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right); 2107 Label L; 2108 if (unordered_is_less) { 2109 movl(dst, -1); 2110 jcc(Assembler::parity, L); 2111 jcc(Assembler::below , L); 2112 movl(dst, 0); 2113 jcc(Assembler::equal , L); 2114 increment(dst); 2115 } else { // unordered is greater 2116 movl(dst, 1); 2117 jcc(Assembler::parity, L); 2118 jcc(Assembler::above , L); 2119 movl(dst, 0); 2120 jcc(Assembler::equal , L); 2121 decrementl(dst); 2122 } 2123 bind(L); 2124 } 2125 2126 void MacroAssembler::fld_d(AddressLiteral src) { 2127 fld_d(as_Address(src)); 2128 } 2129 2130 void MacroAssembler::fld_s(AddressLiteral src) { 2131 fld_s(as_Address(src)); 2132 } 2133 2134 void MacroAssembler::fldcw(AddressLiteral src) { 2135 fldcw(as_Address(src)); 2136 } 2137 2138 void MacroAssembler::fpop() { 2139 ffree(); 2140 fincstp(); 2141 } 2142 2143 void MacroAssembler::fremr(Register tmp) { 2144 save_rax(tmp); 2145 { Label L; 2146 bind(L); 2147 fprem(); 2148 fwait(); fnstsw_ax(); 2149 sahf(); 2150 jcc(Assembler::parity, L); 2151 } 2152 restore_rax(tmp); 2153 // Result is in ST0. 
2154 // Note: fxch & fpop to get rid of ST1 2155 // (otherwise FPU stack could overflow eventually) 2156 fxch(1); 2157 fpop(); 2158 } 2159 2160 void MacroAssembler::empty_FPU_stack() { 2161 if (VM_Version::supports_mmx()) { 2162 emms(); 2163 } else { 2164 for (int i = 8; i-- > 0; ) ffree(i); 2165 } 2166 } 2167 #endif // !LP64 2168 2169 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src, Register rscratch) { 2170 assert(rscratch != noreg || always_reachable(src), "missing"); 2171 if (reachable(src)) { 2172 Assembler::mulpd(dst, as_Address(src)); 2173 } else { 2174 lea(rscratch, src); 2175 Assembler::mulpd(dst, Address(rscratch, 0)); 2176 } 2177 } 2178 2179 void MacroAssembler::load_float(Address src) { 2180 #ifdef _LP64 2181 movflt(xmm0, src); 2182 #else 2183 if (UseSSE >= 1) { 2184 movflt(xmm0, src); 2185 } else { 2186 fld_s(src); 2187 } 2188 #endif // LP64 2189 } 2190 2191 void MacroAssembler::store_float(Address dst) { 2192 #ifdef _LP64 2193 movflt(dst, xmm0); 2194 #else 2195 if (UseSSE >= 1) { 2196 movflt(dst, xmm0); 2197 } else { 2198 fstp_s(dst); 2199 } 2200 #endif // LP64 2201 } 2202 2203 void MacroAssembler::load_double(Address src) { 2204 #ifdef _LP64 2205 movdbl(xmm0, src); 2206 #else 2207 if (UseSSE >= 2) { 2208 movdbl(xmm0, src); 2209 } else { 2210 fld_d(src); 2211 } 2212 #endif // LP64 2213 } 2214 2215 void MacroAssembler::store_double(Address dst) { 2216 #ifdef _LP64 2217 movdbl(dst, xmm0); 2218 #else 2219 if (UseSSE >= 2) { 2220 movdbl(dst, xmm0); 2221 } else { 2222 fstp_d(dst); 2223 } 2224 #endif // LP64 2225 } 2226 2227 // dst = c = a * b + c 2228 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) { 2229 Assembler::vfmadd231sd(c, a, b); 2230 if (dst != c) { 2231 movdbl(dst, c); 2232 } 2233 } 2234 2235 // dst = c = a * b + c 2236 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) { 2237 Assembler::vfmadd231ss(c, a, b); 2238 if (dst != c) { 2239 movflt(dst, c); 2240 } 2241 } 2242 2243 // dst = c = a * b + c 2244 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) { 2245 Assembler::vfmadd231pd(c, a, b, vector_len); 2246 if (dst != c) { 2247 vmovdqu(dst, c); 2248 } 2249 } 2250 2251 // dst = c = a * b + c 2252 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) { 2253 Assembler::vfmadd231ps(c, a, b, vector_len); 2254 if (dst != c) { 2255 vmovdqu(dst, c); 2256 } 2257 } 2258 2259 // dst = c = a * b + c 2260 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) { 2261 Assembler::vfmadd231pd(c, a, b, vector_len); 2262 if (dst != c) { 2263 vmovdqu(dst, c); 2264 } 2265 } 2266 2267 // dst = c = a * b + c 2268 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) { 2269 Assembler::vfmadd231ps(c, a, b, vector_len); 2270 if (dst != c) { 2271 vmovdqu(dst, c); 2272 } 2273 } 2274 2275 void MacroAssembler::incrementl(AddressLiteral dst, Register rscratch) { 2276 assert(rscratch != noreg || always_reachable(dst), "missing"); 2277 2278 if (reachable(dst)) { 2279 incrementl(as_Address(dst)); 2280 } else { 2281 lea(rscratch, dst); 2282 incrementl(Address(rscratch, 0)); 2283 } 2284 } 2285 2286 void MacroAssembler::incrementl(ArrayAddress dst, Register rscratch) { 2287 incrementl(as_Address(dst, rscratch)); 2288 } 2289 2290 void MacroAssembler::incrementl(Register reg, int value) { 2291 if (value == min_jint) 
{addl(reg, value) ; return; } 2292 if (value < 0) { decrementl(reg, -value); return; } 2293 if (value == 0) { ; return; } 2294 if (value == 1 && UseIncDec) { incl(reg) ; return; } 2295 /* else */ { addl(reg, value) ; return; } 2296 } 2297 2298 void MacroAssembler::incrementl(Address dst, int value) { 2299 if (value == min_jint) {addl(dst, value) ; return; } 2300 if (value < 0) { decrementl(dst, -value); return; } 2301 if (value == 0) { ; return; } 2302 if (value == 1 && UseIncDec) { incl(dst) ; return; } 2303 /* else */ { addl(dst, value) ; return; } 2304 } 2305 2306 void MacroAssembler::jump(AddressLiteral dst, Register rscratch) { 2307 assert(rscratch != noreg || always_reachable(dst), "missing"); 2308 2309 if (reachable(dst)) { 2310 jmp_literal(dst.target(), dst.rspec()); 2311 } else { 2312 lea(rscratch, dst); 2313 jmp(rscratch); 2314 } 2315 } 2316 2317 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst, Register rscratch) { 2318 assert(rscratch != noreg || always_reachable(dst), "missing"); 2319 2320 if (reachable(dst)) { 2321 InstructionMark im(this); 2322 relocate(dst.reloc()); 2323 const int short_size = 2; 2324 const int long_size = 6; 2325 int offs = (intptr_t)dst.target() - ((intptr_t)pc()); 2326 if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) { 2327 // 0111 tttn #8-bit disp 2328 emit_int8(0x70 | cc); 2329 emit_int8((offs - short_size) & 0xFF); 2330 } else { 2331 // 0000 1111 1000 tttn #32-bit disp 2332 emit_int8(0x0F); 2333 emit_int8((unsigned char)(0x80 | cc)); 2334 emit_int32(offs - long_size); 2335 } 2336 } else { 2337 #ifdef ASSERT 2338 warning("reversing conditional branch"); 2339 #endif /* ASSERT */ 2340 Label skip; 2341 jccb(reverse[cc], skip); 2342 lea(rscratch, dst); 2343 Assembler::jmp(rscratch); 2344 bind(skip); 2345 } 2346 } 2347 2348 void MacroAssembler::ldmxcsr(AddressLiteral src, Register rscratch) { 2349 assert(rscratch != noreg || always_reachable(src), "missing"); 2350 2351 if (reachable(src)) { 2352 Assembler::ldmxcsr(as_Address(src)); 2353 } else { 2354 lea(rscratch, src); 2355 Assembler::ldmxcsr(Address(rscratch, 0)); 2356 } 2357 } 2358 2359 int MacroAssembler::load_signed_byte(Register dst, Address src) { 2360 int off; 2361 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 2362 off = offset(); 2363 movsbl(dst, src); // movsxb 2364 } else { 2365 off = load_unsigned_byte(dst, src); 2366 shll(dst, 24); 2367 sarl(dst, 24); 2368 } 2369 return off; 2370 } 2371 2372 // Note: load_signed_short used to be called load_signed_word. 2373 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler 2374 // manual, which means 16 bits, that usage is found nowhere in HotSpot code. 2375 // The term "word" in HotSpot means a 32- or 64-bit machine word. 2376 int MacroAssembler::load_signed_short(Register dst, Address src) { 2377 int off; 2378 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 2379 // This is dubious to me since it seems safe to do a signed 16 => 64 bit 2380 // version but this is what 64bit has always done. This seems to imply 2381 // that users are only using 32bits worth. 2382 off = offset(); 2383 movswl(dst, src); // movsxw 2384 } else { 2385 off = load_unsigned_short(dst, src); 2386 shll(dst, 16); 2387 sarl(dst, 16); 2388 } 2389 return off; 2390 } 2391 2392 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 2393 // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, 2394 // and "3.9 Partial Register Penalties", p. 22). 
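// Note: when the address expression uses dst (src.uses(dst)), the xorl/movb
// fallback below would clobber the address register before the load, so the
// movzbl path is taken in that case as well.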
2395 int off; 2396 if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) { 2397 off = offset(); 2398 movzbl(dst, src); // movzxb 2399 } else { 2400 xorl(dst, dst); 2401 off = offset(); 2402 movb(dst, src); 2403 } 2404 return off; 2405 } 2406 2407 // Note: load_unsigned_short used to be called load_unsigned_word. 2408 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 2409 // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, 2410 // and "3.9 Partial Register Penalties", p. 22). 2411 int off; 2412 if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) { 2413 off = offset(); 2414 movzwl(dst, src); // movzxw 2415 } else { 2416 xorl(dst, dst); 2417 off = offset(); 2418 movw(dst, src); 2419 } 2420 return off; 2421 } 2422 2423 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 2424 switch (size_in_bytes) { 2425 #ifndef _LP64 2426 case 8: 2427 assert(dst2 != noreg, "second dest register required"); 2428 movl(dst, src); 2429 movl(dst2, src.plus_disp(BytesPerInt)); 2430 break; 2431 #else 2432 case 8: movq(dst, src); break; 2433 #endif 2434 case 4: movl(dst, src); break; 2435 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 2436 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 2437 default: ShouldNotReachHere(); 2438 } 2439 } 2440 2441 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 2442 switch (size_in_bytes) { 2443 #ifndef _LP64 2444 case 8: 2445 assert(src2 != noreg, "second source register required"); 2446 movl(dst, src); 2447 movl(dst.plus_disp(BytesPerInt), src2); 2448 break; 2449 #else 2450 case 8: movq(dst, src); break; 2451 #endif 2452 case 4: movl(dst, src); break; 2453 case 2: movw(dst, src); break; 2454 case 1: movb(dst, src); break; 2455 default: ShouldNotReachHere(); 2456 } 2457 } 2458 2459 void MacroAssembler::mov32(AddressLiteral dst, Register src, Register rscratch) { 2460 assert(rscratch != noreg || always_reachable(dst), "missing"); 2461 2462 if (reachable(dst)) { 2463 movl(as_Address(dst), src); 2464 } else { 2465 lea(rscratch, dst); 2466 movl(Address(rscratch, 0), src); 2467 } 2468 } 2469 2470 void MacroAssembler::mov32(Register dst, AddressLiteral src) { 2471 if (reachable(src)) { 2472 movl(dst, as_Address(src)); 2473 } else { 2474 lea(dst, src); 2475 movl(dst, Address(dst, 0)); 2476 } 2477 } 2478 2479 // C++ bool manipulation 2480 2481 void MacroAssembler::movbool(Register dst, Address src) { 2482 if(sizeof(bool) == 1) 2483 movb(dst, src); 2484 else if(sizeof(bool) == 2) 2485 movw(dst, src); 2486 else if(sizeof(bool) == 4) 2487 movl(dst, src); 2488 else 2489 // unsupported 2490 ShouldNotReachHere(); 2491 } 2492 2493 void MacroAssembler::movbool(Address dst, bool boolconst) { 2494 if(sizeof(bool) == 1) 2495 movb(dst, (int) boolconst); 2496 else if(sizeof(bool) == 2) 2497 movw(dst, (int) boolconst); 2498 else if(sizeof(bool) == 4) 2499 movl(dst, (int) boolconst); 2500 else 2501 // unsupported 2502 ShouldNotReachHere(); 2503 } 2504 2505 void MacroAssembler::movbool(Address dst, Register src) { 2506 if(sizeof(bool) == 1) 2507 movb(dst, src); 2508 else if(sizeof(bool) == 2) 2509 movw(dst, src); 2510 else if(sizeof(bool) == 4) 2511 movl(dst, src); 2512 else 2513 // unsupported 2514 ShouldNotReachHere(); 2515 } 2516 2517 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src, Register rscratch) { 2518 assert(rscratch != noreg || 
always_reachable(src), "missing"); 2519 2520 if (reachable(src)) { 2521 movdl(dst, as_Address(src)); 2522 } else { 2523 lea(rscratch, src); 2524 movdl(dst, Address(rscratch, 0)); 2525 } 2526 } 2527 2528 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src, Register rscratch) { 2529 assert(rscratch != noreg || always_reachable(src), "missing"); 2530 2531 if (reachable(src)) { 2532 movq(dst, as_Address(src)); 2533 } else { 2534 lea(rscratch, src); 2535 movq(dst, Address(rscratch, 0)); 2536 } 2537 } 2538 2539 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src, Register rscratch) { 2540 assert(rscratch != noreg || always_reachable(src), "missing"); 2541 2542 if (reachable(src)) { 2543 if (UseXmmLoadAndClearUpper) { 2544 movsd (dst, as_Address(src)); 2545 } else { 2546 movlpd(dst, as_Address(src)); 2547 } 2548 } else { 2549 lea(rscratch, src); 2550 if (UseXmmLoadAndClearUpper) { 2551 movsd (dst, Address(rscratch, 0)); 2552 } else { 2553 movlpd(dst, Address(rscratch, 0)); 2554 } 2555 } 2556 } 2557 2558 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src, Register rscratch) { 2559 assert(rscratch != noreg || always_reachable(src), "missing"); 2560 2561 if (reachable(src)) { 2562 movss(dst, as_Address(src)); 2563 } else { 2564 lea(rscratch, src); 2565 movss(dst, Address(rscratch, 0)); 2566 } 2567 } 2568 2569 void MacroAssembler::movptr(Register dst, Register src) { 2570 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2571 } 2572 2573 void MacroAssembler::movptr(Register dst, Address src) { 2574 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2575 } 2576 2577 // src should NEVER be a real pointer. Use AddressLiteral for true pointers 2578 void MacroAssembler::movptr(Register dst, intptr_t src) { 2579 LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src)); 2580 } 2581 2582 void MacroAssembler::movptr(Address dst, Register src) { 2583 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2584 } 2585 2586 void MacroAssembler::movptr(Address dst, int32_t src) { 2587 LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); 2588 } 2589 2590 void MacroAssembler::movdqu(Address dst, XMMRegister src) { 2591 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2592 Assembler::movdqu(dst, src); 2593 } 2594 2595 void MacroAssembler::movdqu(XMMRegister dst, Address src) { 2596 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2597 Assembler::movdqu(dst, src); 2598 } 2599 2600 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) { 2601 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2602 Assembler::movdqu(dst, src); 2603 } 2604 2605 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register rscratch) { 2606 assert(rscratch != noreg || always_reachable(src), "missing"); 2607 2608 if (reachable(src)) { 2609 movdqu(dst, as_Address(src)); 2610 } else { 2611 lea(rscratch, src); 2612 movdqu(dst, Address(rscratch, 0)); 2613 } 2614 } 2615 2616 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) { 2617 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2618 Assembler::vmovdqu(dst, src); 2619 } 2620 2621 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) { 2622 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2623 Assembler::vmovdqu(dst, src); 2624 } 2625 2626 void 
MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) { 2627 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2628 Assembler::vmovdqu(dst, src); 2629 } 2630 2631 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register rscratch) { 2632 assert(rscratch != noreg || always_reachable(src), "missing"); 2633 2634 if (reachable(src)) { 2635 vmovdqu(dst, as_Address(src)); 2636 } 2637 else { 2638 lea(rscratch, src); 2639 vmovdqu(dst, Address(rscratch, 0)); 2640 } 2641 } 2642 2643 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2644 assert(rscratch != noreg || always_reachable(src), "missing"); 2645 2646 if (vector_len == AVX_512bit) { 2647 evmovdquq(dst, src, AVX_512bit, rscratch); 2648 } else if (vector_len == AVX_256bit) { 2649 vmovdqu(dst, src, rscratch); 2650 } else { 2651 movdqu(dst, src, rscratch); 2652 } 2653 } 2654 2655 void MacroAssembler::kmov(KRegister dst, Address src) { 2656 if (VM_Version::supports_avx512bw()) { 2657 kmovql(dst, src); 2658 } else { 2659 assert(VM_Version::supports_evex(), ""); 2660 kmovwl(dst, src); 2661 } 2662 } 2663 2664 void MacroAssembler::kmov(Address dst, KRegister src) { 2665 if (VM_Version::supports_avx512bw()) { 2666 kmovql(dst, src); 2667 } else { 2668 assert(VM_Version::supports_evex(), ""); 2669 kmovwl(dst, src); 2670 } 2671 } 2672 2673 void MacroAssembler::kmov(KRegister dst, KRegister src) { 2674 if (VM_Version::supports_avx512bw()) { 2675 kmovql(dst, src); 2676 } else { 2677 assert(VM_Version::supports_evex(), ""); 2678 kmovwl(dst, src); 2679 } 2680 } 2681 2682 void MacroAssembler::kmov(Register dst, KRegister src) { 2683 if (VM_Version::supports_avx512bw()) { 2684 kmovql(dst, src); 2685 } else { 2686 assert(VM_Version::supports_evex(), ""); 2687 kmovwl(dst, src); 2688 } 2689 } 2690 2691 void MacroAssembler::kmov(KRegister dst, Register src) { 2692 if (VM_Version::supports_avx512bw()) { 2693 kmovql(dst, src); 2694 } else { 2695 assert(VM_Version::supports_evex(), ""); 2696 kmovwl(dst, src); 2697 } 2698 } 2699 2700 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register rscratch) { 2701 assert(rscratch != noreg || always_reachable(src), "missing"); 2702 2703 if (reachable(src)) { 2704 kmovql(dst, as_Address(src)); 2705 } else { 2706 lea(rscratch, src); 2707 kmovql(dst, Address(rscratch, 0)); 2708 } 2709 } 2710 2711 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register rscratch) { 2712 assert(rscratch != noreg || always_reachable(src), "missing"); 2713 2714 if (reachable(src)) { 2715 kmovwl(dst, as_Address(src)); 2716 } else { 2717 lea(rscratch, src); 2718 kmovwl(dst, Address(rscratch, 0)); 2719 } 2720 } 2721 2722 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, 2723 int vector_len, Register rscratch) { 2724 assert(rscratch != noreg || always_reachable(src), "missing"); 2725 2726 if (reachable(src)) { 2727 Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len); 2728 } else { 2729 lea(rscratch, src); 2730 Assembler::evmovdqub(dst, mask, Address(rscratch, 0), merge, vector_len); 2731 } 2732 } 2733 2734 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, 2735 int vector_len, Register rscratch) { 2736 assert(rscratch != noreg || always_reachable(src), "missing"); 2737 2738 if (reachable(src)) { 2739 Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len); 2740 } 
else { 2741 lea(rscratch, src); 2742 Assembler::evmovdquw(dst, mask, Address(rscratch, 0), merge, vector_len); 2743 } 2744 } 2745 2746 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) { 2747 assert(rscratch != noreg || always_reachable(src), "missing"); 2748 2749 if (reachable(src)) { 2750 Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len); 2751 } else { 2752 lea(rscratch, src); 2753 Assembler::evmovdqul(dst, mask, Address(rscratch, 0), merge, vector_len); 2754 } 2755 } 2756 2757 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) { 2758 assert(rscratch != noreg || always_reachable(src), "missing"); 2759 2760 if (reachable(src)) { 2761 Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len); 2762 } else { 2763 lea(rscratch, src); 2764 Assembler::evmovdquq(dst, mask, Address(rscratch, 0), merge, vector_len); 2765 } 2766 } 2767 2768 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2769 assert(rscratch != noreg || always_reachable(src), "missing"); 2770 2771 if (reachable(src)) { 2772 Assembler::evmovdquq(dst, as_Address(src), vector_len); 2773 } else { 2774 lea(rscratch, src); 2775 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len); 2776 } 2777 } 2778 2779 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src, Register rscratch) { 2780 assert(rscratch != noreg || always_reachable(src), "missing"); 2781 2782 if (reachable(src)) { 2783 Assembler::movdqa(dst, as_Address(src)); 2784 } else { 2785 lea(rscratch, src); 2786 Assembler::movdqa(dst, Address(rscratch, 0)); 2787 } 2788 } 2789 2790 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src, Register rscratch) { 2791 assert(rscratch != noreg || always_reachable(src), "missing"); 2792 2793 if (reachable(src)) { 2794 Assembler::movsd(dst, as_Address(src)); 2795 } else { 2796 lea(rscratch, src); 2797 Assembler::movsd(dst, Address(rscratch, 0)); 2798 } 2799 } 2800 2801 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src, Register rscratch) { 2802 assert(rscratch != noreg || always_reachable(src), "missing"); 2803 2804 if (reachable(src)) { 2805 Assembler::movss(dst, as_Address(src)); 2806 } else { 2807 lea(rscratch, src); 2808 Assembler::movss(dst, Address(rscratch, 0)); 2809 } 2810 } 2811 2812 void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) { 2813 assert(rscratch != noreg || always_reachable(src), "missing"); 2814 2815 if (reachable(src)) { 2816 Assembler::movddup(dst, as_Address(src)); 2817 } else { 2818 lea(rscratch, src); 2819 Assembler::movddup(dst, Address(rscratch, 0)); 2820 } 2821 } 2822 2823 void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2824 assert(rscratch != noreg || always_reachable(src), "missing"); 2825 2826 if (reachable(src)) { 2827 Assembler::vmovddup(dst, as_Address(src), vector_len); 2828 } else { 2829 lea(rscratch, src); 2830 Assembler::vmovddup(dst, Address(rscratch, 0), vector_len); 2831 } 2832 } 2833 2834 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src, Register rscratch) { 2835 assert(rscratch != noreg || always_reachable(src), "missing"); 2836 2837 if (reachable(src)) { 2838 Assembler::mulsd(dst, as_Address(src)); 2839 } else { 2840 lea(rscratch, src); 2841 Assembler::mulsd(dst, Address(rscratch, 0)); 2842 } 2843 } 2844 2845 void 
MacroAssembler::mulss(XMMRegister dst, AddressLiteral src, Register rscratch) { 2846 assert(rscratch != noreg || always_reachable(src), "missing"); 2847 2848 if (reachable(src)) { 2849 Assembler::mulss(dst, as_Address(src)); 2850 } else { 2851 lea(rscratch, src); 2852 Assembler::mulss(dst, Address(rscratch, 0)); 2853 } 2854 } 2855 2856 void MacroAssembler::null_check(Register reg, int offset) { 2857 if (needs_explicit_null_check(offset)) { 2858 // provoke OS NULL exception if reg = NULL by 2859 // accessing M[reg] w/o changing any (non-CC) registers 2860 // NOTE: cmpl is plenty here to provoke a segv 2861 cmpptr(rax, Address(reg, 0)); 2862 // Note: should probably use testl(rax, Address(reg, 0)); 2863 // may be shorter code (however, this version of 2864 // testl needs to be implemented first) 2865 } else { 2866 // nothing to do, (later) access of M[reg + offset] 2867 // will provoke OS NULL exception if reg = NULL 2868 } 2869 } 2870 2871 void MacroAssembler::os_breakpoint() { 2872 // instead of directly emitting a breakpoint, call os:breakpoint for better debugability 2873 // (e.g., MSVC can't call ps() otherwise) 2874 call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint))); 2875 } 2876 2877 void MacroAssembler::unimplemented(const char* what) { 2878 const char* buf = NULL; 2879 { 2880 ResourceMark rm; 2881 stringStream ss; 2882 ss.print("unimplemented: %s", what); 2883 buf = code_string(ss.as_string()); 2884 } 2885 stop(buf); 2886 } 2887 2888 #ifdef _LP64 2889 #define XSTATE_BV 0x200 2890 #endif 2891 2892 void MacroAssembler::pop_CPU_state() { 2893 pop_FPU_state(); 2894 pop_IU_state(); 2895 } 2896 2897 void MacroAssembler::pop_FPU_state() { 2898 #ifndef _LP64 2899 frstor(Address(rsp, 0)); 2900 #else 2901 fxrstor(Address(rsp, 0)); 2902 #endif 2903 addptr(rsp, FPUStateSizeInWords * wordSize); 2904 } 2905 2906 void MacroAssembler::pop_IU_state() { 2907 popa(); 2908 LP64_ONLY(addq(rsp, 8)); 2909 popf(); 2910 } 2911 2912 // Save Integer and Float state 2913 // Warning: Stack must be 16 byte aligned (64bit) 2914 void MacroAssembler::push_CPU_state() { 2915 push_IU_state(); 2916 push_FPU_state(); 2917 } 2918 2919 void MacroAssembler::push_FPU_state() { 2920 subptr(rsp, FPUStateSizeInWords * wordSize); 2921 #ifndef _LP64 2922 fnsave(Address(rsp, 0)); 2923 fwait(); 2924 #else 2925 fxsave(Address(rsp, 0)); 2926 #endif // LP64 2927 } 2928 2929 void MacroAssembler::push_IU_state() { 2930 // Push flags first because pusha kills them 2931 pushf(); 2932 // Make sure rsp stays 16-byte aligned 2933 LP64_ONLY(subq(rsp, 8)); 2934 pusha(); 2935 } 2936 2937 void MacroAssembler::push_cont_fastpath() { 2938 if (!Continuations::enabled()) return; 2939 2940 #ifndef _LP64 2941 Register rthread = rax; 2942 Register rrealsp = rbx; 2943 push(rthread); 2944 push(rrealsp); 2945 2946 get_thread(rthread); 2947 2948 // The code below wants the original RSP. 2949 // Move it back after the pushes above. 
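// (The two pushes above lowered rsp by 2 * wordSize; adding that back below
// reconstructs the caller's original stack pointer in rrealsp.)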
2950 movptr(rrealsp, rsp); 2951 addptr(rrealsp, 2*wordSize); 2952 #else 2953 Register rthread = r15_thread; 2954 Register rrealsp = rsp; 2955 #endif 2956 2957 Label done; 2958 cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset())); 2959 jccb(Assembler::belowEqual, done); 2960 movptr(Address(rthread, JavaThread::cont_fastpath_offset()), rrealsp); 2961 bind(done); 2962 2963 #ifndef _LP64 2964 pop(rrealsp); 2965 pop(rthread); 2966 #endif 2967 } 2968 2969 void MacroAssembler::pop_cont_fastpath() { 2970 if (!Continuations::enabled()) return; 2971 2972 #ifndef _LP64 2973 Register rthread = rax; 2974 Register rrealsp = rbx; 2975 push(rthread); 2976 push(rrealsp); 2977 2978 get_thread(rthread); 2979 2980 // The code below wants the original RSP. 2981 // Move it back after the pushes above. 2982 movptr(rrealsp, rsp); 2983 addptr(rrealsp, 2*wordSize); 2984 #else 2985 Register rthread = r15_thread; 2986 Register rrealsp = rsp; 2987 #endif 2988 2989 Label done; 2990 cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset())); 2991 jccb(Assembler::below, done); 2992 movptr(Address(rthread, JavaThread::cont_fastpath_offset()), 0); 2993 bind(done); 2994 2995 #ifndef _LP64 2996 pop(rrealsp); 2997 pop(rthread); 2998 #endif 2999 } 3000 3001 void MacroAssembler::inc_held_monitor_count() { 3002 #ifndef _LP64 3003 Register thread = rax; 3004 push(thread); 3005 get_thread(thread); 3006 incrementl(Address(thread, JavaThread::held_monitor_count_offset())); 3007 pop(thread); 3008 #else // LP64 3009 incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 3010 #endif 3011 } 3012 3013 void MacroAssembler::dec_held_monitor_count() { 3014 #ifndef _LP64 3015 Register thread = rax; 3016 push(thread); 3017 get_thread(thread); 3018 decrementl(Address(thread, JavaThread::held_monitor_count_offset())); 3019 pop(thread); 3020 #else // LP64 3021 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 3022 #endif 3023 } 3024 3025 #ifdef ASSERT 3026 void MacroAssembler::stop_if_in_cont(Register cont, const char* name) { 3027 #ifdef _LP64 3028 Label no_cont; 3029 movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset())); 3030 testl(cont, cont); 3031 jcc(Assembler::zero, no_cont); 3032 stop(name); 3033 bind(no_cont); 3034 #else 3035 Unimplemented(); 3036 #endif 3037 } 3038 #endif 3039 3040 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register 3041 if (!java_thread->is_valid()) { 3042 java_thread = rdi; 3043 get_thread(java_thread); 3044 } 3045 // we must set sp to zero to clear frame 3046 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD); 3047 // must clear fp, so that compiled frames are not confused; it is 3048 // possible that we need it only for debugging 3049 if (clear_fp) { 3050 movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 3051 } 3052 // Always clear the pc because it could have been set by make_walkable() 3053 movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD); 3054 vzeroupper(); 3055 } 3056 3057 void MacroAssembler::restore_rax(Register tmp) { 3058 if (tmp == noreg) pop(rax); 3059 else if (tmp != rax) mov(rax, tmp); 3060 } 3061 3062 void MacroAssembler::round_to(Register reg, int modulus) { 3063 addptr(reg, modulus - 1); 3064 andptr(reg, -modulus); 3065 } 3066 3067 void MacroAssembler::save_rax(Register tmp) { 3068 if (tmp == noreg) push(rax); 3069 else if (tmp != rax) mov(tmp, rax); 3070 } 3071 3072 void 
MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) { 3073 if (at_return) { 3074 // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore, 3075 // we may safely use rsp instead to perform the stack watermark check. 3076 cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset())); 3077 jcc(Assembler::above, slow_path); 3078 return; 3079 } 3080 testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit()); 3081 jcc(Assembler::notZero, slow_path); // handshake bit set implies poll 3082 } 3083 3084 // Calls to C land 3085 // 3086 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded 3087 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp 3088 // has to be reset to 0. This is required to allow proper stack traversal. 3089 void MacroAssembler::set_last_Java_frame(Register java_thread, 3090 Register last_java_sp, 3091 Register last_java_fp, 3092 address last_java_pc, 3093 Register rscratch) { 3094 vzeroupper(); 3095 // determine java_thread register 3096 if (!java_thread->is_valid()) { 3097 java_thread = rdi; 3098 get_thread(java_thread); 3099 } 3100 // determine last_java_sp register 3101 if (!last_java_sp->is_valid()) { 3102 last_java_sp = rsp; 3103 } 3104 // last_java_fp is optional 3105 if (last_java_fp->is_valid()) { 3106 movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp); 3107 } 3108 // last_java_pc is optional 3109 if (last_java_pc != NULL) { 3110 Address java_pc(java_thread, 3111 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()); 3112 lea(java_pc, InternalAddress(last_java_pc), rscratch); 3113 } 3114 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp); 3115 } 3116 3117 void MacroAssembler::shlptr(Register dst, int imm8) { 3118 LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8)); 3119 } 3120 3121 void MacroAssembler::shrptr(Register dst, int imm8) { 3122 LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8)); 3123 } 3124 3125 void MacroAssembler::sign_extend_byte(Register reg) { 3126 if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) { 3127 movsbl(reg, reg); // movsxb 3128 } else { 3129 shll(reg, 24); 3130 sarl(reg, 24); 3131 } 3132 } 3133 3134 void MacroAssembler::sign_extend_short(Register reg) { 3135 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 3136 movswl(reg, reg); // movsxw 3137 } else { 3138 shll(reg, 16); 3139 sarl(reg, 16); 3140 } 3141 } 3142 3143 void MacroAssembler::testl(Address dst, int32_t imm32) { 3144 if (imm32 >= 0 && is8bit(imm32)) { 3145 testb(dst, imm32); 3146 } else { 3147 Assembler::testl(dst, imm32); 3148 } 3149 } 3150 3151 void MacroAssembler::testl(Register dst, int32_t imm32) { 3152 if (imm32 >= 0 && is8bit(imm32) && dst->has_byte_register()) { 3153 testb(dst, imm32); 3154 } else { 3155 Assembler::testl(dst, imm32); 3156 } 3157 } 3158 3159 void MacroAssembler::testl(Register dst, AddressLiteral src) { 3160 assert(always_reachable(src), "Address should be reachable"); 3161 testl(dst, as_Address(src)); 3162 } 3163 3164 #ifdef _LP64 3165 3166 void MacroAssembler::testq(Address dst, int32_t imm32) { 3167 if (imm32 >= 0) { 3168 testl(dst, imm32); 3169 } else { 3170 Assembler::testq(dst, imm32); 3171 } 3172 } 3173 3174 void MacroAssembler::testq(Register dst, int32_t imm32) { 3175 if (imm32 >= 0) { 3176 testl(dst, imm32); 3177 } else { 3178 Assembler::testq(dst, 
imm32); 3179 } 3180 } 3181 3182 #endif 3183 3184 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) { 3185 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3186 Assembler::pcmpeqb(dst, src); 3187 } 3188 3189 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) { 3190 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3191 Assembler::pcmpeqw(dst, src); 3192 } 3193 3194 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) { 3195 assert((dst->encoding() < 16),"XMM register should be 0-15"); 3196 Assembler::pcmpestri(dst, src, imm8); 3197 } 3198 3199 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) { 3200 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 3201 Assembler::pcmpestri(dst, src, imm8); 3202 } 3203 3204 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) { 3205 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3206 Assembler::pmovzxbw(dst, src); 3207 } 3208 3209 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) { 3210 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3211 Assembler::pmovzxbw(dst, src); 3212 } 3213 3214 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) { 3215 assert((src->encoding() < 16),"XMM register should be 0-15"); 3216 Assembler::pmovmskb(dst, src); 3217 } 3218 3219 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) { 3220 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 3221 Assembler::ptest(dst, src); 3222 } 3223 3224 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src, Register rscratch) { 3225 assert(rscratch != noreg || always_reachable(src), "missing"); 3226 3227 if (reachable(src)) { 3228 Assembler::sqrtss(dst, as_Address(src)); 3229 } else { 3230 lea(rscratch, src); 3231 Assembler::sqrtss(dst, Address(rscratch, 0)); 3232 } 3233 } 3234 3235 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src, Register rscratch) { 3236 assert(rscratch != noreg || always_reachable(src), "missing"); 3237 3238 if (reachable(src)) { 3239 Assembler::subsd(dst, as_Address(src)); 3240 } else { 3241 lea(rscratch, src); 3242 Assembler::subsd(dst, Address(rscratch, 0)); 3243 } 3244 } 3245 3246 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch) { 3247 assert(rscratch != noreg || always_reachable(src), "missing"); 3248 3249 if (reachable(src)) { 3250 Assembler::roundsd(dst, as_Address(src), rmode); 3251 } else { 3252 lea(rscratch, src); 3253 Assembler::roundsd(dst, Address(rscratch, 0), rmode); 3254 } 3255 } 3256 3257 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src, Register rscratch) { 3258 assert(rscratch != noreg || always_reachable(src), "missing"); 3259 3260 if (reachable(src)) { 3261 Assembler::subss(dst, as_Address(src)); 3262 } else { 3263 lea(rscratch, src); 3264 Assembler::subss(dst, Address(rscratch, 0)); 3265 } 3266 } 3267 3268 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch) { 3269 assert(rscratch != noreg || always_reachable(src), "missing"); 3270 3271 if (reachable(src)) { 3272 Assembler::ucomisd(dst, as_Address(src)); 3273 } else { 3274 lea(rscratch, src); 3275 Assembler::ucomisd(dst, 
Address(rscratch, 0)); 3276 } 3277 } 3278 3279 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch) { 3280 assert(rscratch != noreg || always_reachable(src), "missing"); 3281 3282 if (reachable(src)) { 3283 Assembler::ucomiss(dst, as_Address(src)); 3284 } else { 3285 lea(rscratch, src); 3286 Assembler::ucomiss(dst, Address(rscratch, 0)); 3287 } 3288 } 3289 3290 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register rscratch) { 3291 assert(rscratch != noreg || always_reachable(src), "missing"); 3292 3293 // Used in sign-bit flipping with aligned address. 3294 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 3295 if (reachable(src)) { 3296 Assembler::xorpd(dst, as_Address(src)); 3297 } else { 3298 lea(rscratch, src); 3299 Assembler::xorpd(dst, Address(rscratch, 0)); 3300 } 3301 } 3302 3303 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) { 3304 if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) { 3305 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit); 3306 } 3307 else { 3308 Assembler::xorpd(dst, src); 3309 } 3310 } 3311 3312 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) { 3313 if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) { 3314 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit); 3315 } else { 3316 Assembler::xorps(dst, src); 3317 } 3318 } 3319 3320 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register rscratch) { 3321 assert(rscratch != noreg || always_reachable(src), "missing"); 3322 3323 // Used in sign-bit flipping with aligned address. 3324 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 3325 if (reachable(src)) { 3326 Assembler::xorps(dst, as_Address(src)); 3327 } else { 3328 lea(rscratch, src); 3329 Assembler::xorps(dst, Address(rscratch, 0)); 3330 } 3331 } 3332 3333 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src, Register rscratch) { 3334 assert(rscratch != noreg || always_reachable(src), "missing"); 3335 3336 // Used in sign-bit flipping with aligned address. 
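// (The legacy SSE encoding of pshufb requires its 128-bit memory operand to be
// 16-byte aligned; the AVX/VEX encoding does not, which is why the assert
// below only demands alignment when UseAVX == 0.)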
3337 bool aligned_adr = (((intptr_t)src.target() & 15) == 0); 3338 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes"); 3339 if (reachable(src)) { 3340 Assembler::pshufb(dst, as_Address(src)); 3341 } else { 3342 lea(rscratch, src); 3343 Assembler::pshufb(dst, Address(rscratch, 0)); 3344 } 3345 } 3346 3347 // AVX 3-operands instructions 3348 3349 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3350 assert(rscratch != noreg || always_reachable(src), "missing"); 3351 3352 if (reachable(src)) { 3353 vaddsd(dst, nds, as_Address(src)); 3354 } else { 3355 lea(rscratch, src); 3356 vaddsd(dst, nds, Address(rscratch, 0)); 3357 } 3358 } 3359 3360 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3361 assert(rscratch != noreg || always_reachable(src), "missing"); 3362 3363 if (reachable(src)) { 3364 vaddss(dst, nds, as_Address(src)); 3365 } else { 3366 lea(rscratch, src); 3367 vaddss(dst, nds, Address(rscratch, 0)); 3368 } 3369 } 3370 3371 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3372 assert(UseAVX > 0, "requires some form of AVX"); 3373 assert(rscratch != noreg || always_reachable(src), "missing"); 3374 3375 if (reachable(src)) { 3376 Assembler::vpaddb(dst, nds, as_Address(src), vector_len); 3377 } else { 3378 lea(rscratch, src); 3379 Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len); 3380 } 3381 } 3382 3383 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3384 assert(UseAVX > 0, "requires some form of AVX"); 3385 assert(rscratch != noreg || always_reachable(src), "missing"); 3386 3387 if (reachable(src)) { 3388 Assembler::vpaddd(dst, nds, as_Address(src), vector_len); 3389 } else { 3390 lea(rscratch, src); 3391 Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len); 3392 } 3393 } 3394 3395 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) { 3396 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3397 assert(rscratch != noreg || always_reachable(negate_field), "missing"); 3398 3399 vandps(dst, nds, negate_field, vector_len, rscratch); 3400 } 3401 3402 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) { 3403 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3404 assert(rscratch != noreg || always_reachable(negate_field), "missing"); 3405 3406 vandpd(dst, nds, negate_field, vector_len, rscratch); 3407 } 3408 3409 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3410 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3411 Assembler::vpaddb(dst, nds, src, vector_len); 3412 } 3413 3414 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3415 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3416 Assembler::vpaddb(dst, nds, src, vector_len); 3417 } 3418 3419 void 
MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3420 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3421 Assembler::vpaddw(dst, nds, src, vector_len); 3422 } 3423 3424 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3425 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3426 Assembler::vpaddw(dst, nds, src, vector_len); 3427 } 3428 3429 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3430 assert(rscratch != noreg || always_reachable(src), "missing"); 3431 3432 if (reachable(src)) { 3433 Assembler::vpand(dst, nds, as_Address(src), vector_len); 3434 } else { 3435 lea(rscratch, src); 3436 Assembler::vpand(dst, nds, Address(rscratch, 0), vector_len); 3437 } 3438 } 3439 3440 void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3441 assert(rscratch != noreg || always_reachable(src), "missing"); 3442 3443 if (reachable(src)) { 3444 Assembler::vpbroadcastd(dst, as_Address(src), vector_len); 3445 } else { 3446 lea(rscratch, src); 3447 Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len); 3448 } 3449 } 3450 3451 void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3452 assert(rscratch != noreg || always_reachable(src), "missing"); 3453 3454 if (reachable(src)) { 3455 Assembler::vpbroadcastq(dst, as_Address(src), vector_len); 3456 } else { 3457 lea(rscratch, src); 3458 Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len); 3459 } 3460 } 3461 3462 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3463 assert(rscratch != noreg || always_reachable(src), "missing"); 3464 3465 if (reachable(src)) { 3466 Assembler::vbroadcastsd(dst, as_Address(src), vector_len); 3467 } else { 3468 lea(rscratch, src); 3469 Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len); 3470 } 3471 } 3472 3473 void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3474 assert(rscratch != noreg || always_reachable(src), "missing"); 3475 3476 if (reachable(src)) { 3477 Assembler::vbroadcastss(dst, as_Address(src), vector_len); 3478 } else { 3479 lea(rscratch, src); 3480 Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len); 3481 } 3482 } 3483 3484 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3485 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3486 Assembler::vpcmpeqb(dst, nds, src, vector_len); 3487 } 3488 3489 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3490 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3491 Assembler::vpcmpeqw(dst, nds, src, vector_len); 3492 } 3493 3494 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3495 assert(rscratch != noreg || always_reachable(src), "missing"); 3496 3497 if (reachable(src)) { 3498 Assembler::evpcmpeqd(kdst, mask, 
nds, as_Address(src), vector_len); 3499 } else { 3500 lea(rscratch, src); 3501 Assembler::evpcmpeqd(kdst, mask, nds, Address(rscratch, 0), vector_len); 3502 } 3503 } 3504 3505 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3506 int comparison, bool is_signed, int vector_len, Register rscratch) { 3507 assert(rscratch != noreg || always_reachable(src), "missing"); 3508 3509 if (reachable(src)) { 3510 Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3511 } else { 3512 lea(rscratch, src); 3513 Assembler::evpcmpd(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len); 3514 } 3515 } 3516 3517 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3518 int comparison, bool is_signed, int vector_len, Register rscratch) { 3519 assert(rscratch != noreg || always_reachable(src), "missing"); 3520 3521 if (reachable(src)) { 3522 Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3523 } else { 3524 lea(rscratch, src); 3525 Assembler::evpcmpq(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len); 3526 } 3527 } 3528 3529 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3530 int comparison, bool is_signed, int vector_len, Register rscratch) { 3531 assert(rscratch != noreg || always_reachable(src), "missing"); 3532 3533 if (reachable(src)) { 3534 Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3535 } else { 3536 lea(rscratch, src); 3537 Assembler::evpcmpb(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len); 3538 } 3539 } 3540 3541 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3542 int comparison, bool is_signed, int vector_len, Register rscratch) { 3543 assert(rscratch != noreg || always_reachable(src), "missing"); 3544 3545 if (reachable(src)) { 3546 Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3547 } else { 3548 lea(rscratch, src); 3549 Assembler::evpcmpw(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len); 3550 } 3551 } 3552 3553 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) { 3554 if (width == Assembler::Q) { 3555 Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len); 3556 } else { 3557 Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len); 3558 } 3559 } 3560 3561 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) { 3562 int eq_cond_enc = 0x29; 3563 int gt_cond_enc = 0x37; 3564 if (width != Assembler::Q) { 3565 eq_cond_enc = 0x74 + width; 3566 gt_cond_enc = 0x64 + width; 3567 } 3568 switch (cond) { 3569 case eq: 3570 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len); 3571 break; 3572 case neq: 3573 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len); 3574 vallones(xtmp, vector_len); 3575 vpxor(dst, xtmp, dst, vector_len); 3576 break; 3577 case le: 3578 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len); 3579 vallones(xtmp, vector_len); 3580 vpxor(dst, xtmp, dst, vector_len); 3581 break; 3582 case nlt: 3583 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len); 3584 vallones(xtmp, vector_len); 3585 vpxor(dst, xtmp, dst, vector_len); 3586 break; 3587 case lt: 
3588 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len); 3589 break; 3590 case nle: 3591 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len); 3592 break; 3593 default: 3594 assert(false, "Should not reach here"); 3595 } 3596 } 3597 3598 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) { 3599 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3600 Assembler::vpmovzxbw(dst, src, vector_len); 3601 } 3602 3603 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) { 3604 assert((src->encoding() < 16),"XMM register should be 0-15"); 3605 Assembler::vpmovmskb(dst, src, vector_len); 3606 } 3607 3608 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3609 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3610 Assembler::vpmullw(dst, nds, src, vector_len); 3611 } 3612 3613 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3614 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3615 Assembler::vpmullw(dst, nds, src, vector_len); 3616 } 3617 3618 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3619 assert((UseAVX > 0), "AVX support is needed"); 3620 assert(rscratch != noreg || always_reachable(src), "missing"); 3621 3622 if (reachable(src)) { 3623 Assembler::vpmulld(dst, nds, as_Address(src), vector_len); 3624 } else { 3625 lea(rscratch, src); 3626 Assembler::vpmulld(dst, nds, Address(rscratch, 0), vector_len); 3627 } 3628 } 3629 3630 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3631 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3632 Assembler::vpsubb(dst, nds, src, vector_len); 3633 } 3634 3635 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3636 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3637 Assembler::vpsubb(dst, nds, src, vector_len); 3638 } 3639 3640 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3641 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3642 Assembler::vpsubw(dst, nds, src, vector_len); 3643 } 3644 3645 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3646 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3647 Assembler::vpsubw(dst, nds, src, vector_len); 3648 } 3649 3650 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3651 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3652 Assembler::vpsraw(dst, nds, shift, vector_len); 3653 } 3654 3655 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3656 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 
0-15"); 3657 Assembler::vpsraw(dst, nds, shift, vector_len); 3658 } 3659 3660 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3661 assert(UseAVX > 2,""); 3662 if (!VM_Version::supports_avx512vl() && vector_len < 2) { 3663 vector_len = 2; 3664 } 3665 Assembler::evpsraq(dst, nds, shift, vector_len); 3666 } 3667 3668 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3669 assert(UseAVX > 2,""); 3670 if (!VM_Version::supports_avx512vl() && vector_len < 2) { 3671 vector_len = 2; 3672 } 3673 Assembler::evpsraq(dst, nds, shift, vector_len); 3674 } 3675 3676 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3677 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3678 Assembler::vpsrlw(dst, nds, shift, vector_len); 3679 } 3680 3681 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3682 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3683 Assembler::vpsrlw(dst, nds, shift, vector_len); 3684 } 3685 3686 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3687 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3688 Assembler::vpsllw(dst, nds, shift, vector_len); 3689 } 3690 3691 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3692 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3693 Assembler::vpsllw(dst, nds, shift, vector_len); 3694 } 3695 3696 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) { 3697 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 3698 Assembler::vptest(dst, src); 3699 } 3700 3701 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) { 3702 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3703 Assembler::punpcklbw(dst, src); 3704 } 3705 3706 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) { 3707 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 3708 Assembler::pshufd(dst, src, mode); 3709 } 3710 3711 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) { 3712 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3713 Assembler::pshuflw(dst, src, mode); 3714 } 3715 3716 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3717 assert(rscratch != noreg || always_reachable(src), "missing"); 3718 3719 if (reachable(src)) { 3720 vandpd(dst, nds, as_Address(src), vector_len); 3721 } else { 3722 lea(rscratch, src); 3723 vandpd(dst, nds, Address(rscratch, 0), vector_len); 3724 } 3725 } 3726 3727 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3728 assert(rscratch != noreg || always_reachable(src), "missing"); 3729 3730 if (reachable(src)) { 3731 vandps(dst, nds, as_Address(src), vector_len); 3732 } else { 3733 lea(rscratch, src); 3734 vandps(dst, 
nds, Address(rscratch, 0), vector_len); 3735 } 3736 } 3737 3738 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, 3739 bool merge, int vector_len, Register rscratch) { 3740 assert(rscratch != noreg || always_reachable(src), "missing"); 3741 3742 if (reachable(src)) { 3743 Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len); 3744 } else { 3745 lea(rscratch, src); 3746 Assembler::evpord(dst, mask, nds, Address(rscratch, 0), merge, vector_len); 3747 } 3748 } 3749 3750 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3751 assert(rscratch != noreg || always_reachable(src), "missing"); 3752 3753 if (reachable(src)) { 3754 vdivsd(dst, nds, as_Address(src)); 3755 } else { 3756 lea(rscratch, src); 3757 vdivsd(dst, nds, Address(rscratch, 0)); 3758 } 3759 } 3760 3761 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3762 assert(rscratch != noreg || always_reachable(src), "missing"); 3763 3764 if (reachable(src)) { 3765 vdivss(dst, nds, as_Address(src)); 3766 } else { 3767 lea(rscratch, src); 3768 vdivss(dst, nds, Address(rscratch, 0)); 3769 } 3770 } 3771 3772 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3773 assert(rscratch != noreg || always_reachable(src), "missing"); 3774 3775 if (reachable(src)) { 3776 vmulsd(dst, nds, as_Address(src)); 3777 } else { 3778 lea(rscratch, src); 3779 vmulsd(dst, nds, Address(rscratch, 0)); 3780 } 3781 } 3782 3783 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3784 assert(rscratch != noreg || always_reachable(src), "missing"); 3785 3786 if (reachable(src)) { 3787 vmulss(dst, nds, as_Address(src)); 3788 } else { 3789 lea(rscratch, src); 3790 vmulss(dst, nds, Address(rscratch, 0)); 3791 } 3792 } 3793 3794 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3795 assert(rscratch != noreg || always_reachable(src), "missing"); 3796 3797 if (reachable(src)) { 3798 vsubsd(dst, nds, as_Address(src)); 3799 } else { 3800 lea(rscratch, src); 3801 vsubsd(dst, nds, Address(rscratch, 0)); 3802 } 3803 } 3804 3805 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3806 assert(rscratch != noreg || always_reachable(src), "missing"); 3807 3808 if (reachable(src)) { 3809 vsubss(dst, nds, as_Address(src)); 3810 } else { 3811 lea(rscratch, src); 3812 vsubss(dst, nds, Address(rscratch, 0)); 3813 } 3814 } 3815 3816 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3817 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3818 assert(rscratch != noreg || always_reachable(src), "missing"); 3819 3820 vxorps(dst, nds, src, Assembler::AVX_128bit, rscratch); 3821 } 3822 3823 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3824 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3825 assert(rscratch != noreg || always_reachable(src), "missing"); 3826 3827 vxorpd(dst, nds, src, Assembler::AVX_128bit, rscratch); 3828 } 3829 3830 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3831 
assert(rscratch != noreg || always_reachable(src), "missing"); 3832 3833 if (reachable(src)) { 3834 vxorpd(dst, nds, as_Address(src), vector_len); 3835 } else { 3836 lea(rscratch, src); 3837 vxorpd(dst, nds, Address(rscratch, 0), vector_len); 3838 } 3839 } 3840 3841 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3842 assert(rscratch != noreg || always_reachable(src), "missing"); 3843 3844 if (reachable(src)) { 3845 vxorps(dst, nds, as_Address(src), vector_len); 3846 } else { 3847 lea(rscratch, src); 3848 vxorps(dst, nds, Address(rscratch, 0), vector_len); 3849 } 3850 } 3851 3852 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3853 assert(rscratch != noreg || always_reachable(src), "missing"); 3854 3855 if (UseAVX > 1 || (vector_len < 1)) { 3856 if (reachable(src)) { 3857 Assembler::vpxor(dst, nds, as_Address(src), vector_len); 3858 } else { 3859 lea(rscratch, src); 3860 Assembler::vpxor(dst, nds, Address(rscratch, 0), vector_len); 3861 } 3862 } else { 3863 MacroAssembler::vxorpd(dst, nds, src, vector_len, rscratch); 3864 } 3865 } 3866 3867 void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3868 assert(rscratch != noreg || always_reachable(src), "missing"); 3869 3870 if (reachable(src)) { 3871 Assembler::vpermd(dst, nds, as_Address(src), vector_len); 3872 } else { 3873 lea(rscratch, src); 3874 Assembler::vpermd(dst, nds, Address(rscratch, 0), vector_len); 3875 } 3876 } 3877 3878 void MacroAssembler::clear_jobject_tag(Register possibly_non_local) { 3879 const int32_t inverted_mask = ~static_cast<int32_t>(JNIHandles::tag_mask); 3880 STATIC_ASSERT(inverted_mask == -4); // otherwise check this code 3881 // The inverted mask is sign-extended 3882 andptr(possibly_non_local, inverted_mask); 3883 } 3884 3885 void MacroAssembler::resolve_jobject(Register value, 3886 Register thread, 3887 Register tmp) { 3888 assert_different_registers(value, thread, tmp); 3889 Label done, tagged, weak_tagged; 3890 testptr(value, value); 3891 jcc(Assembler::zero, done); // Use NULL as-is. 3892 testptr(value, JNIHandles::tag_mask); // Test for tag. 3893 jcc(Assembler::notZero, tagged); 3894 3895 // Resolve local handle 3896 access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp, thread); 3897 verify_oop(value); 3898 jmp(done); 3899 3900 bind(tagged); 3901 testptr(value, JNIHandles::TypeTag::weak_global); // Test for weak tag. 3902 jcc(Assembler::notZero, weak_tagged); 3903 3904 // Resolve global handle 3905 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp, thread); 3906 verify_oop(value); 3907 jmp(done); 3908 3909 bind(weak_tagged); 3910 // Resolve jweak. 3911 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, 3912 value, Address(value, -JNIHandles::TypeTag::weak_global), tmp, thread); 3913 verify_oop(value); 3914 3915 bind(done); 3916 } 3917 3918 void MacroAssembler::resolve_global_jobject(Register value, 3919 Register thread, 3920 Register tmp) { 3921 assert_different_registers(value, thread, tmp); 3922 Label done; 3923 3924 testptr(value, value); 3925 jcc(Assembler::zero, done); // Use NULL as-is. 3926 3927 #ifdef ASSERT 3928 { 3929 Label valid_global_tag; 3930 testptr(value, JNIHandles::TypeTag::global); // Test for global tag. 
3931 jcc(Assembler::notZero, valid_global_tag); 3932 stop("non global jobject using resolve_global_jobject"); 3933 bind(valid_global_tag); 3934 } 3935 #endif 3936 3937 // Resolve global handle 3938 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp, thread); 3939 verify_oop(value); 3940 3941 bind(done); 3942 } 3943 3944 void MacroAssembler::subptr(Register dst, int32_t imm32) { 3945 LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32)); 3946 } 3947 3948 // Force generation of a 4 byte immediate value even if it fits into 8bit 3949 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) { 3950 LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32)); 3951 } 3952 3953 void MacroAssembler::subptr(Register dst, Register src) { 3954 LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); 3955 } 3956 3957 // C++ bool manipulation 3958 void MacroAssembler::testbool(Register dst) { 3959 if(sizeof(bool) == 1) 3960 testb(dst, 0xff); 3961 else if(sizeof(bool) == 2) { 3962 // testw implementation needed for two byte bools 3963 ShouldNotReachHere(); 3964 } else if(sizeof(bool) == 4) 3965 testl(dst, dst); 3966 else 3967 // unsupported 3968 ShouldNotReachHere(); 3969 } 3970 3971 void MacroAssembler::testptr(Register dst, Register src) { 3972 LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src)); 3973 } 3974 3975 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 3976 void MacroAssembler::tlab_allocate(Register thread, Register obj, 3977 Register var_size_in_bytes, 3978 int con_size_in_bytes, 3979 Register t1, 3980 Register t2, 3981 Label& slow_case) { 3982 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3983 bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); 3984 } 3985 3986 RegSet MacroAssembler::call_clobbered_gp_registers() { 3987 RegSet regs; 3988 #ifdef _LP64 3989 regs += RegSet::of(rax, rcx, rdx); 3990 #ifndef WINDOWS 3991 regs += RegSet::of(rsi, rdi); 3992 #endif 3993 regs += RegSet::range(r8, r11); 3994 #else 3995 regs += RegSet::of(rax, rcx, rdx); 3996 #endif 3997 return regs; 3998 } 3999 4000 XMMRegSet MacroAssembler::call_clobbered_xmm_registers() { 4001 int num_xmm_registers = XMMRegister::available_xmm_registers(); 4002 #if defined(WINDOWS) && defined(_LP64) 4003 XMMRegSet result = XMMRegSet::range(xmm0, xmm5); 4004 if (num_xmm_registers > 16) { 4005 result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1)); 4006 } 4007 return result; 4008 #else 4009 return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1)); 4010 #endif 4011 } 4012 4013 static int FPUSaveAreaSize = align_up(108, StackAlignmentInBytes); // 108 bytes needed for FPU state by fsave/frstor 4014 4015 #ifndef _LP64 4016 static bool use_x87_registers() { return UseSSE < 2; } 4017 #endif 4018 static bool use_xmm_registers() { return UseSSE >= 1; } 4019 4020 // C1 only ever uses the first double/float of the XMM register. 4021 static int xmm_save_size() { return UseSSE >= 2 ? 
sizeof(double) : sizeof(float); } 4022 4023 static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) { 4024 if (UseSSE == 1) { 4025 masm->movflt(Address(rsp, offset), reg); 4026 } else { 4027 masm->movdbl(Address(rsp, offset), reg); 4028 } 4029 } 4030 4031 static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) { 4032 if (UseSSE == 1) { 4033 masm->movflt(reg, Address(rsp, offset)); 4034 } else { 4035 masm->movdbl(reg, Address(rsp, offset)); 4036 } 4037 } 4038 4039 int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers, bool save_fpu, 4040 int& gp_area_size, int& fp_area_size, int& xmm_area_size) { 4041 4042 gp_area_size = align_up(gp_registers.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size, 4043 StackAlignmentInBytes); 4044 #ifdef _LP64 4045 fp_area_size = 0; 4046 #else 4047 fp_area_size = (save_fpu && use_x87_registers()) ? FPUSaveAreaSize : 0; 4048 #endif 4049 xmm_area_size = (save_fpu && use_xmm_registers()) ? xmm_registers.size() * xmm_save_size() : 0; 4050 4051 return gp_area_size + fp_area_size + xmm_area_size; 4052 } 4053 4054 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) { 4055 block_comment("push_call_clobbered_registers start"); 4056 // Regular registers 4057 RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude; 4058 4059 int gp_area_size; 4060 int fp_area_size; 4061 int xmm_area_size; 4062 int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu, 4063 gp_area_size, fp_area_size, xmm_area_size); 4064 subptr(rsp, total_save_size); 4065 4066 push_set(gp_registers_to_push, 0); 4067 4068 #ifndef _LP64 4069 if (save_fpu && use_x87_registers()) { 4070 fnsave(Address(rsp, gp_area_size)); 4071 fwait(); 4072 } 4073 #endif 4074 if (save_fpu && use_xmm_registers()) { 4075 push_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size); 4076 } 4077 4078 block_comment("push_call_clobbered_registers end"); 4079 } 4080 4081 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) { 4082 block_comment("pop_call_clobbered_registers start"); 4083 4084 RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude; 4085 4086 int gp_area_size; 4087 int fp_area_size; 4088 int xmm_area_size; 4089 int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu, 4090 gp_area_size, fp_area_size, xmm_area_size); 4091 4092 if (restore_fpu && use_xmm_registers()) { 4093 pop_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size); 4094 } 4095 #ifndef _LP64 4096 if (restore_fpu && use_x87_registers()) { 4097 frstor(Address(rsp, gp_area_size)); 4098 } 4099 #endif 4100 4101 pop_set(gp_registers_to_pop, 0); 4102 4103 addptr(rsp, total_save_size); 4104 4105 vzeroupper(); 4106 4107 block_comment("pop_call_clobbered_registers end"); 4108 } 4109 4110 void MacroAssembler::push_set(XMMRegSet set, int offset) { 4111 assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be"); 4112 int spill_offset = offset; 4113 4114 for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) { 4115 save_xmm_register(this, spill_offset, *it); 4116 spill_offset += xmm_save_size(); 4117 } 4118 } 4119 4120 void MacroAssembler::pop_set(XMMRegSet set, int offset) { 4121 int restore_size = set.size() * xmm_save_size(); 4122 assert(is_aligned(restore_size, StackAlignmentInBytes), "must be"); 4123 4124 int 
restore_offset = offset + restore_size - xmm_save_size(); 4125 4126 for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) { 4127 restore_xmm_register(this, restore_offset, *it); 4128 restore_offset -= xmm_save_size(); 4129 } 4130 } 4131 4132 void MacroAssembler::push_set(RegSet set, int offset) { 4133 int spill_offset; 4134 if (offset == -1) { 4135 int register_push_size = set.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size; 4136 int aligned_size = align_up(register_push_size, StackAlignmentInBytes); 4137 subptr(rsp, aligned_size); 4138 spill_offset = 0; 4139 } else { 4140 spill_offset = offset; 4141 } 4142 4143 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) { 4144 movptr(Address(rsp, spill_offset), *it); 4145 spill_offset += Register::max_slots_per_register * VMRegImpl::stack_slot_size; 4146 } 4147 } 4148 4149 void MacroAssembler::pop_set(RegSet set, int offset) { 4150 4151 int gp_reg_size = Register::max_slots_per_register * VMRegImpl::stack_slot_size; 4152 int restore_size = set.size() * gp_reg_size; 4153 int aligned_size = align_up(restore_size, StackAlignmentInBytes); 4154 4155 int restore_offset; 4156 if (offset == -1) { 4157 restore_offset = restore_size - gp_reg_size; 4158 } else { 4159 restore_offset = offset + restore_size - gp_reg_size; 4160 } 4161 for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) { 4162 movptr(*it, Address(rsp, restore_offset)); 4163 restore_offset -= gp_reg_size; 4164 } 4165 4166 if (offset == -1) { 4167 addptr(rsp, aligned_size); 4168 } 4169 } 4170 4171 // Preserves the contents of address, destroys the contents length_in_bytes and temp. 4172 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) { 4173 assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different"); 4174 assert((offset_in_bytes & (BytesPerInt - 1)) == 0, "offset must be a multiple of BytesPerInt"); 4175 Label done; 4176 4177 testptr(length_in_bytes, length_in_bytes); 4178 jcc(Assembler::zero, done); 4179 4180 // Emit single 32bit store to clear leading bytes, if necessary. 
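// (A sketch of the reasoning, derived from the code below: on 64-bit the incoming
// offset_in_bytes may be only 4-byte aligned - for example the first field offset
// behind a 12-byte header when compressed class pointers are in use - so a single
// movl is emitted first to reach word alignment before the word-sized clearing loop.)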
4181 xorptr(temp, temp); // use _zero reg to clear memory (shorter code) 4182 #ifdef _LP64 4183 if (!is_aligned(offset_in_bytes, BytesPerWord)) { 4184 movl(Address(address, offset_in_bytes), temp); 4185 offset_in_bytes += BytesPerInt; 4186 decrement(length_in_bytes, BytesPerInt); 4187 } 4188 assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord"); 4189 testptr(length_in_bytes, length_in_bytes); 4190 jcc(Assembler::zero, done); 4191 #endif 4192 4193 // initialize topmost word, divide index by 2, check if odd and test if zero 4194 // note: for the remaining code to work, index must be a multiple of BytesPerWord 4195 #ifdef ASSERT 4196 { 4197 Label L; 4198 testptr(length_in_bytes, BytesPerWord - 1); 4199 jcc(Assembler::zero, L); 4200 stop("length must be a multiple of BytesPerWord"); 4201 bind(L); 4202 } 4203 #endif 4204 Register index = length_in_bytes; 4205 if (UseIncDec) { 4206 shrptr(index, 3); // divide by 8/16 and set carry flag if bit 2 was set 4207 } else { 4208 shrptr(index, 2); // use 2 instructions to avoid partial flag stall 4209 shrptr(index, 1); 4210 } 4211 #ifndef _LP64 4212 // index could have not been a multiple of 8 (i.e., bit 2 was set) 4213 { 4214 Label even; 4215 // note: if index was a multiple of 8, then it cannot 4216 // be 0 now otherwise it must have been 0 before 4217 // => if it is even, we don't need to check for 0 again 4218 jcc(Assembler::carryClear, even); 4219 // clear topmost word (no jump would be needed if conditional assignment worked here) 4220 movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp); 4221 // index could be 0 now, must check again 4222 jcc(Assembler::zero, done); 4223 bind(even); 4224 } 4225 #endif // !_LP64 4226 // initialize remaining object fields: index is a multiple of 2 now 4227 { 4228 Label loop; 4229 bind(loop); 4230 movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp); 4231 NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);) 4232 decrement(index); 4233 jcc(Assembler::notZero, loop); 4234 } 4235 4236 bind(done); 4237 } 4238 4239 // Look up the method for a megamorphic invokeinterface call. 4240 // The target method is determined by <intf_klass, itable_index>. 4241 // The receiver klass is in recv_klass. 4242 // On success, the result will be in method_result, and execution falls through. 4243 // On failure, execution transfers to the given label. 
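// Rough picture of the structure scanned below (field names abbreviated; see
// klassVtable.hpp for the authoritative layout):
//
//   recv_klass
//     vtable[0 .. vtable_length)   -- one word (Method*) per vtableEntry
//     itableOffsetEntry[]          -- { Klass* interface; int offset; }, null interface terminates
//     itableMethodEntry[] ...      -- per-interface Method* tables, located via 'offset'
//
// The loop below walks the itableOffsetEntry list looking for intf_klass and, when
// return_method is true, loads method_result from the matching table at itable_index.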
4244 void MacroAssembler::lookup_interface_method(Register recv_klass, 4245 Register intf_klass, 4246 RegisterOrConstant itable_index, 4247 Register method_result, 4248 Register scan_temp, 4249 Label& L_no_such_interface, 4250 bool return_method) { 4251 assert_different_registers(recv_klass, intf_klass, scan_temp); 4252 assert_different_registers(method_result, intf_klass, scan_temp); 4253 assert(recv_klass != method_result || !return_method, 4254 "recv_klass can be destroyed when method isn't needed"); 4255 4256 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 4257 "caller must use same register for non-constant itable index as for method"); 4258 4259 // Compute start of first itableOffsetEntry (which is at the end of the vtable) 4260 int vtable_base = in_bytes(Klass::vtable_start_offset()); 4261 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 4262 int scan_step = itableOffsetEntry::size() * wordSize; 4263 int vte_size = vtableEntry::size_in_bytes(); 4264 Address::ScaleFactor times_vte_scale = Address::times_ptr; 4265 assert(vte_size == wordSize, "else adjust times_vte_scale"); 4266 4267 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); 4268 4269 // %%% Could store the aligned, prescaled offset in the klassoop. 4270 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base)); 4271 4272 if (return_method) { 4273 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 4274 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 4275 lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); 4276 } 4277 4278 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 4279 // if (scan->interface() == intf) { 4280 // result = (klass + scan->offset() + itable_index); 4281 // } 4282 // } 4283 Label search, found_method; 4284 4285 for (int peel = 1; peel >= 0; peel--) { 4286 movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes())); 4287 cmpptr(intf_klass, method_result); 4288 4289 if (peel) { 4290 jccb(Assembler::equal, found_method); 4291 } else { 4292 jccb(Assembler::notEqual, search); 4293 // (invert the test to fall through to found_method...) 4294 } 4295 4296 if (!peel) break; 4297 4298 bind(search); 4299 4300 // Check that the previous entry is non-null. A null entry means that 4301 // the receiver class doesn't implement the interface, and wasn't the 4302 // same as when the caller was compiled. 4303 testptr(method_result, method_result); 4304 jcc(Assembler::zero, L_no_such_interface); 4305 addptr(scan_temp, scan_step); 4306 } 4307 4308 bind(found_method); 4309 4310 if (return_method) { 4311 // Got a hit. 
4312 movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes())); 4313 movptr(method_result, Address(recv_klass, scan_temp, Address::times_1)); 4314 } 4315 } 4316 4317 4318 // virtual method calling 4319 void MacroAssembler::lookup_virtual_method(Register recv_klass, 4320 RegisterOrConstant vtable_index, 4321 Register method_result) { 4322 const int base = in_bytes(Klass::vtable_start_offset()); 4323 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below"); 4324 Address vtable_entry_addr(recv_klass, 4325 vtable_index, Address::times_ptr, 4326 base + vtableEntry::method_offset_in_bytes()); 4327 movptr(method_result, vtable_entry_addr); 4328 } 4329 4330 4331 void MacroAssembler::check_klass_subtype(Register sub_klass, 4332 Register super_klass, 4333 Register temp_reg, 4334 Label& L_success) { 4335 Label L_failure; 4336 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL); 4337 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL); 4338 bind(L_failure); 4339 } 4340 4341 4342 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 4343 Register super_klass, 4344 Register temp_reg, 4345 Label* L_success, 4346 Label* L_failure, 4347 Label* L_slow_path, 4348 RegisterOrConstant super_check_offset) { 4349 assert_different_registers(sub_klass, super_klass, temp_reg); 4350 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 4351 if (super_check_offset.is_register()) { 4352 assert_different_registers(sub_klass, super_klass, 4353 super_check_offset.as_register()); 4354 } else if (must_load_sco) { 4355 assert(temp_reg != noreg, "supply either a temp or a register offset"); 4356 } 4357 4358 Label L_fallthrough; 4359 int label_nulls = 0; 4360 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 4361 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 4362 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 4363 assert(label_nulls <= 1, "at most one NULL in the batch"); 4364 4365 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 4366 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 4367 Address super_check_offset_addr(super_klass, sco_offset); 4368 4369 // Hacked jcc, which "knows" that L_fallthrough, at least, is in 4370 // range of a jccb. If this routine grows larger, reconsider at 4371 // least some of these. 4372 #define local_jcc(assembler_cond, label) \ 4373 if (&(label) == &L_fallthrough) jccb(assembler_cond, label); \ 4374 else jcc( assembler_cond, label) /*omit semi*/ 4375 4376 // Hacked jmp, which may only be used just before L_fallthrough. 4377 #define final_jmp(label) \ 4378 if (&(label) == &L_fallthrough) { /*do nothing*/ } \ 4379 else jmp(label) /*omit semi*/ 4380 4381 // If the pointers are equal, we are done (e.g., String[] elements). 4382 // This self-check enables sharing of secondary supertype arrays among 4383 // non-primary types such as array-of-interface. Otherwise, each such 4384 // type would need its own customized SSA. 4385 // We move this check to the front of the fast path because many 4386 // type checks are in fact trivially successful in this manner, 4387 // so we get a nicely predicted branch right at the start of the check. 4388 cmpptr(sub_klass, super_klass); 4389 local_jcc(Assembler::equal, *L_success); 4390 4391 // Check the supertype display: 4392 if (must_load_sco) { 4393 // Positive movl does right thing on LP64. 
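// (super_check_offset is super_klass->_super_check_offset: for a primary supertype it is
// the byte offset, within any would-be subclass, of the primary-super display slot at
// super_klass's depth; otherwise it equals secondary_super_cache_offset, so the compare
// below then reads sub_klass's cache slot instead.)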
4394 movl(temp_reg, super_check_offset_addr); 4395 super_check_offset = RegisterOrConstant(temp_reg); 4396 } 4397 Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0); 4398 cmpptr(super_klass, super_check_addr); // load displayed supertype 4399 4400 // This check has worked decisively for primary supers. 4401 // Secondary supers are sought in the super_cache ('super_cache_addr'). 4402 // (Secondary supers are interfaces and very deeply nested subtypes.) 4403 // This works in the same check above because of a tricky aliasing 4404 // between the super_cache and the primary super display elements. 4405 // (The 'super_check_addr' can address either, as the case requires.) 4406 // Note that the cache is updated below if it does not help us find 4407 // what we need immediately. 4408 // So if it was a primary super, we can just fail immediately. 4409 // Otherwise, it's the slow path for us (no success at this point). 4410 4411 if (super_check_offset.is_register()) { 4412 local_jcc(Assembler::equal, *L_success); 4413 cmpl(super_check_offset.as_register(), sc_offset); 4414 if (L_failure == &L_fallthrough) { 4415 local_jcc(Assembler::equal, *L_slow_path); 4416 } else { 4417 local_jcc(Assembler::notEqual, *L_failure); 4418 final_jmp(*L_slow_path); 4419 } 4420 } else if (super_check_offset.as_constant() == sc_offset) { 4421 // Need a slow path; fast failure is impossible. 4422 if (L_slow_path == &L_fallthrough) { 4423 local_jcc(Assembler::equal, *L_success); 4424 } else { 4425 local_jcc(Assembler::notEqual, *L_slow_path); 4426 final_jmp(*L_success); 4427 } 4428 } else { 4429 // No slow path; it's a fast decision. 4430 if (L_failure == &L_fallthrough) { 4431 local_jcc(Assembler::equal, *L_success); 4432 } else { 4433 local_jcc(Assembler::notEqual, *L_failure); 4434 final_jmp(*L_success); 4435 } 4436 } 4437 4438 bind(L_fallthrough); 4439 4440 #undef local_jcc 4441 #undef final_jmp 4442 } 4443 4444 4445 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 4446 Register super_klass, 4447 Register temp_reg, 4448 Register temp2_reg, 4449 Label* L_success, 4450 Label* L_failure, 4451 bool set_cond_codes) { 4452 assert_different_registers(sub_klass, super_klass, temp_reg); 4453 if (temp2_reg != noreg) 4454 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg); 4455 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) 4456 4457 Label L_fallthrough; 4458 int label_nulls = 0; 4459 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 4460 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 4461 assert(label_nulls <= 1, "at most one NULL in the batch"); 4462 4463 // a couple of useful fields in sub_klass: 4464 int ss_offset = in_bytes(Klass::secondary_supers_offset()); 4465 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 4466 Address secondary_supers_addr(sub_klass, ss_offset); 4467 Address super_cache_addr( sub_klass, sc_offset); 4468 4469 // Do a linear scan of the secondary super-klass chain. 4470 // This code is rarely used, so simplicity is a virtue here. 4471 // The repne_scan instruction uses fixed registers, which we must spill. 4472 // Don't worry too much about pre-existing connections with the input regs. 4473 4474 assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super) 4475 assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter) 4476 4477 // Get super_klass value into rax (even if it was in rdi or rcx). 
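// (repne scas, emitted by repne_scan() below, implicitly uses rax as the value sought,
// rdi as the cursor into the secondary-supers array and rcx as the remaining count,
// which is why exactly these three registers are spilled here when they are live.)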
4478 bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false; 4479 if (super_klass != rax) { 4480 if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; } 4481 mov(rax, super_klass); 4482 } 4483 if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; } 4484 if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; } 4485 4486 #ifndef PRODUCT 4487 int* pst_counter = &SharedRuntime::_partial_subtype_ctr; 4488 ExternalAddress pst_counter_addr((address) pst_counter); 4489 NOT_LP64( incrementl(pst_counter_addr) ); 4490 LP64_ONLY( lea(rcx, pst_counter_addr) ); 4491 LP64_ONLY( incrementl(Address(rcx, 0)) ); 4492 #endif //PRODUCT 4493 4494 // We will consult the secondary-super array. 4495 movptr(rdi, secondary_supers_addr); 4496 // Load the array length. (Positive movl does right thing on LP64.) 4497 movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes())); 4498 // Skip to start of data. 4499 addptr(rdi, Array<Klass*>::base_offset_in_bytes()); 4500 4501 // Scan RCX words at [RDI] for an occurrence of RAX. 4502 // Set NZ/Z based on last compare. 4503 // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does 4504 // not change flags (only scas instruction which is repeated sets flags). 4505 // Set Z = 0 (not equal) before 'repne' to indicate that class was not found. 4506 4507 testptr(rax,rax); // Set Z = 0 4508 repne_scan(); 4509 4510 // Unspill the temp. registers: 4511 if (pushed_rdi) pop(rdi); 4512 if (pushed_rcx) pop(rcx); 4513 if (pushed_rax) pop(rax); 4514 4515 if (set_cond_codes) { 4516 // Special hack for the AD files: rdi is guaranteed non-zero. 4517 assert(!pushed_rdi, "rdi must be left non-NULL"); 4518 // Also, the condition codes are properly set Z/NZ on succeed/failure. 4519 } 4520 4521 if (L_failure == &L_fallthrough) 4522 jccb(Assembler::notEqual, *L_failure); 4523 else jcc(Assembler::notEqual, *L_failure); 4524 4525 // Success. Cache the super we found and proceed in triumph. 
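// (With the cache updated, the next fast-path check of this sub/super pair can hit via
// the super_check_offset slot, which aliases the cache for secondary supers, instead of
// re-running this scan.)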
4526 movptr(super_cache_addr, super_klass); 4527 4528 if (L_success != &L_fallthrough) { 4529 jmp(*L_success); 4530 } 4531 4532 #undef IS_A_TEMP 4533 4534 bind(L_fallthrough); 4535 } 4536 4537 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 4538 assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required"); 4539 4540 Label L_fallthrough; 4541 if (L_fast_path == NULL) { 4542 L_fast_path = &L_fallthrough; 4543 } else if (L_slow_path == NULL) { 4544 L_slow_path = &L_fallthrough; 4545 } 4546 4547 // Fast path check: class is fully initialized 4548 cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized); 4549 jcc(Assembler::equal, *L_fast_path); 4550 4551 // Fast path check: current thread is initializer thread 4552 cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset())); 4553 if (L_slow_path == &L_fallthrough) { 4554 jcc(Assembler::equal, *L_fast_path); 4555 bind(*L_slow_path); 4556 } else if (L_fast_path == &L_fallthrough) { 4557 jcc(Assembler::notEqual, *L_slow_path); 4558 bind(*L_fast_path); 4559 } else { 4560 Unimplemented(); 4561 } 4562 } 4563 4564 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) { 4565 if (VM_Version::supports_cmov()) { 4566 cmovl(cc, dst, src); 4567 } else { 4568 Label L; 4569 jccb(negate_condition(cc), L); 4570 movl(dst, src); 4571 bind(L); 4572 } 4573 } 4574 4575 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) { 4576 if (VM_Version::supports_cmov()) { 4577 cmovl(cc, dst, src); 4578 } else { 4579 Label L; 4580 jccb(negate_condition(cc), L); 4581 movl(dst, src); 4582 bind(L); 4583 } 4584 } 4585 4586 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) { 4587 if (!VerifyOops) return; 4588 4589 BLOCK_COMMENT("verify_oop {"); 4590 #ifdef _LP64 4591 push(rscratch1); 4592 #endif 4593 push(rax); // save rax 4594 push(reg); // pass register argument 4595 4596 // Pass register number to verify_oop_subroutine 4597 const char* b = NULL; 4598 { 4599 ResourceMark rm; 4600 stringStream ss; 4601 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line); 4602 b = code_string(ss.as_string()); 4603 } 4604 ExternalAddress buffer((address) b); 4605 pushptr(buffer.addr(), rscratch1); 4606 4607 // call indirectly to solve generation ordering problem 4608 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 4609 call(rax); 4610 // Caller pops the arguments (oop, message) and restores rax, r10 4611 BLOCK_COMMENT("} verify_oop"); 4612 } 4613 4614 void MacroAssembler::vallones(XMMRegister dst, int vector_len) { 4615 if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 4616 // Only pcmpeq has dependency breaking treatment (i.e the execution can begin without 4617 // waiting for the previous result on dst), not vpcmpeqd, so just use vpternlog 4618 vpternlogd(dst, 0xFF, dst, dst, vector_len); 4619 } else if (VM_Version::supports_avx()) { 4620 vpcmpeqd(dst, dst, dst, vector_len); 4621 } else { 4622 assert(VM_Version::supports_sse2(), ""); 4623 pcmpeqd(dst, dst); 4624 } 4625 } 4626 4627 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, 4628 int extra_slot_offset) { 4629 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
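// Illustrative example (constant-slot case; the concrete values are assumed, not taken
// from this file): with arg_slot == 1 and extra_slot_offset == 0 the code below returns
//   Address(rsp, Interpreter::expr_offset_in_bytes(0) + 1 * Interpreter::stackElementSize + wordSize)
// i.e. the expression-stack slot plus one word to step over the return PC on the stack.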
4630 int stackElementSize = Interpreter::stackElementSize; 4631 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); 4632 #ifdef ASSERT 4633 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); 4634 assert(offset1 - offset == stackElementSize, "correct arithmetic"); 4635 #endif 4636 Register scale_reg = noreg; 4637 Address::ScaleFactor scale_factor = Address::no_scale; 4638 if (arg_slot.is_constant()) { 4639 offset += arg_slot.as_constant() * stackElementSize; 4640 } else { 4641 scale_reg = arg_slot.as_register(); 4642 scale_factor = Address::times(stackElementSize); 4643 } 4644 offset += wordSize; // return PC is on stack 4645 return Address(rsp, scale_reg, scale_factor, offset); 4646 } 4647 4648 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) { 4649 if (!VerifyOops) return; 4650 4651 #ifdef _LP64 4652 push(rscratch1); 4653 #endif 4654 push(rax); // save rax 4655 // addr may contain rsp, so we have to adjust it to account for the push 4656 // we just did (and on 64-bit we did two pushes). 4657 // NOTE: the 64-bit code once had a bug here: it emitted movq(addr, rax), which 4658 // stored rax into addr - the reverse of what was intended. 4659 if (addr.uses(rsp)) { 4660 lea(rax, addr); 4661 pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord)); 4662 } else { 4663 pushptr(addr); 4664 } 4665 4666 // Pass the error message to verify_oop_subroutine 4667 const char* b = NULL; 4668 { 4669 ResourceMark rm; 4670 stringStream ss; 4671 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line); 4672 b = code_string(ss.as_string()); 4673 } 4674 ExternalAddress buffer((address) b); 4675 pushptr(buffer.addr(), rscratch1); 4676 4677 // call indirectly to solve generation ordering problem 4678 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 4679 call(rax); 4680 // Caller pops the arguments (addr, message) and restores rax, r10.
4681 } 4682 4683 void MacroAssembler::verify_tlab() { 4684 #ifdef ASSERT 4685 if (UseTLAB && VerifyOops) { 4686 Label next, ok; 4687 Register t1 = rsi; 4688 Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread); 4689 4690 push(t1); 4691 NOT_LP64(push(thread_reg)); 4692 NOT_LP64(get_thread(thread_reg)); 4693 4694 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset()))); 4695 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset()))); 4696 jcc(Assembler::aboveEqual, next); 4697 STOP("assert(top >= start)"); 4698 should_not_reach_here(); 4699 4700 bind(next); 4701 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset()))); 4702 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset()))); 4703 jcc(Assembler::aboveEqual, ok); 4704 STOP("assert(top <= end)"); 4705 should_not_reach_here(); 4706 4707 bind(ok); 4708 NOT_LP64(pop(thread_reg)); 4709 pop(t1); 4710 } 4711 #endif 4712 } 4713 4714 class ControlWord { 4715 public: 4716 int32_t _value; 4717 4718 int rounding_control() const { return (_value >> 10) & 3 ; } 4719 int precision_control() const { return (_value >> 8) & 3 ; } 4720 bool precision() const { return ((_value >> 5) & 1) != 0; } 4721 bool underflow() const { return ((_value >> 4) & 1) != 0; } 4722 bool overflow() const { return ((_value >> 3) & 1) != 0; } 4723 bool zero_divide() const { return ((_value >> 2) & 1) != 0; } 4724 bool denormalized() const { return ((_value >> 1) & 1) != 0; } 4725 bool invalid() const { return ((_value >> 0) & 1) != 0; } 4726 4727 void print() const { 4728 // rounding control 4729 const char* rc; 4730 switch (rounding_control()) { 4731 case 0: rc = "round near"; break; 4732 case 1: rc = "round down"; break; 4733 case 2: rc = "round up "; break; 4734 case 3: rc = "chop "; break; 4735 default: 4736 rc = NULL; // silence compiler warnings 4737 fatal("Unknown rounding control: %d", rounding_control()); 4738 }; 4739 // precision control 4740 const char* pc; 4741 switch (precision_control()) { 4742 case 0: pc = "24 bits "; break; 4743 case 1: pc = "reserved"; break; 4744 case 2: pc = "53 bits "; break; 4745 case 3: pc = "64 bits "; break; 4746 default: 4747 pc = NULL; // silence compiler warnings 4748 fatal("Unknown precision control: %d", precision_control()); 4749 }; 4750 // flags 4751 char f[9]; 4752 f[0] = ' '; 4753 f[1] = ' '; 4754 f[2] = (precision ()) ? 'P' : 'p'; 4755 f[3] = (underflow ()) ? 'U' : 'u'; 4756 f[4] = (overflow ()) ? 'O' : 'o'; 4757 f[5] = (zero_divide ()) ? 'Z' : 'z'; 4758 f[6] = (denormalized()) ? 'D' : 'd'; 4759 f[7] = (invalid ()) ? 
'I' : 'i'; 4760 f[8] = '\x0'; 4761 // output 4762 printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc); 4763 } 4764 4765 }; 4766 4767 class StatusWord { 4768 public: 4769 int32_t _value; 4770 4771 bool busy() const { return ((_value >> 15) & 1) != 0; } 4772 bool C3() const { return ((_value >> 14) & 1) != 0; } 4773 bool C2() const { return ((_value >> 10) & 1) != 0; } 4774 bool C1() const { return ((_value >> 9) & 1) != 0; } 4775 bool C0() const { return ((_value >> 8) & 1) != 0; } 4776 int top() const { return (_value >> 11) & 7 ; } 4777 bool error_status() const { return ((_value >> 7) & 1) != 0; } 4778 bool stack_fault() const { return ((_value >> 6) & 1) != 0; } 4779 bool precision() const { return ((_value >> 5) & 1) != 0; } 4780 bool underflow() const { return ((_value >> 4) & 1) != 0; } 4781 bool overflow() const { return ((_value >> 3) & 1) != 0; } 4782 bool zero_divide() const { return ((_value >> 2) & 1) != 0; } 4783 bool denormalized() const { return ((_value >> 1) & 1) != 0; } 4784 bool invalid() const { return ((_value >> 0) & 1) != 0; } 4785 4786 void print() const { 4787 // condition codes 4788 char c[5]; 4789 c[0] = (C3()) ? '3' : '-'; 4790 c[1] = (C2()) ? '2' : '-'; 4791 c[2] = (C1()) ? '1' : '-'; 4792 c[3] = (C0()) ? '0' : '-'; 4793 c[4] = '\x0'; 4794 // flags 4795 char f[9]; 4796 f[0] = (error_status()) ? 'E' : '-'; 4797 f[1] = (stack_fault ()) ? 'S' : '-'; 4798 f[2] = (precision ()) ? 'P' : '-'; 4799 f[3] = (underflow ()) ? 'U' : '-'; 4800 f[4] = (overflow ()) ? 'O' : '-'; 4801 f[5] = (zero_divide ()) ? 'Z' : '-'; 4802 f[6] = (denormalized()) ? 'D' : '-'; 4803 f[7] = (invalid ()) ? 'I' : '-'; 4804 f[8] = '\x0'; 4805 // output 4806 printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, f, c, top()); 4807 } 4808 4809 }; 4810 4811 class TagWord { 4812 public: 4813 int32_t _value; 4814 4815 int tag_at(int i) const { return (_value >> (i*2)) & 3; } 4816 4817 void print() const { 4818 printf("%04x", _value & 0xFFFF); 4819 } 4820 4821 }; 4822 4823 class FPU_Register { 4824 public: 4825 int32_t _m0; 4826 int32_t _m1; 4827 int16_t _ex; 4828 4829 bool is_indefinite() const { 4830 return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0; 4831 } 4832 4833 void print() const { 4834 char sign = (_ex < 0) ? '-' : '+'; 4835 const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : " "; 4836 printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, kind); 4837 }; 4838 4839 }; 4840 4841 class FPU_State { 4842 public: 4843 enum { 4844 register_size = 10, 4845 number_of_registers = 8, 4846 register_mask = 7 4847 }; 4848 4849 ControlWord _control_word; 4850 StatusWord _status_word; 4851 TagWord _tag_word; 4852 int32_t _error_offset; 4853 int32_t _error_selector; 4854 int32_t _data_offset; 4855 int32_t _data_selector; 4856 int8_t _register[register_size * number_of_registers]; 4857 4858 int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); } 4859 FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; } 4860 4861 const char* tag_as_string(int tag) const { 4862 switch (tag) { 4863 case 0: return "valid"; 4864 case 1: return "zero"; 4865 case 2: return "special"; 4866 case 3: return "empty"; 4867 } 4868 ShouldNotReachHere(); 4869 return NULL; 4870 } 4871 4872 void print() const { 4873 // print computation registers 4874 { int t = _status_word.top(); 4875 for (int i = 0; i < number_of_registers; i++) { 4876 int j = (i - t) & register_mask; 4877 printf("%c r%d = ST%d = ", (j == 0 ? 
'*' : ' '), i, j); 4878 st(j)->print(); 4879 printf(" %s\n", tag_as_string(_tag_word.tag_at(i))); 4880 } 4881 } 4882 printf("\n"); 4883 // print control registers 4884 printf("ctrl = "); _control_word.print(); printf("\n"); 4885 printf("stat = "); _status_word .print(); printf("\n"); 4886 printf("tags = "); _tag_word .print(); printf("\n"); 4887 } 4888 4889 }; 4890 4891 class Flag_Register { 4892 public: 4893 int32_t _value; 4894 4895 bool overflow() const { return ((_value >> 11) & 1) != 0; } 4896 bool direction() const { return ((_value >> 10) & 1) != 0; } 4897 bool sign() const { return ((_value >> 7) & 1) != 0; } 4898 bool zero() const { return ((_value >> 6) & 1) != 0; } 4899 bool auxiliary_carry() const { return ((_value >> 4) & 1) != 0; } 4900 bool parity() const { return ((_value >> 2) & 1) != 0; } 4901 bool carry() const { return ((_value >> 0) & 1) != 0; } 4902 4903 void print() const { 4904 // flags 4905 char f[8]; 4906 f[0] = (overflow ()) ? 'O' : '-'; 4907 f[1] = (direction ()) ? 'D' : '-'; 4908 f[2] = (sign ()) ? 'S' : '-'; 4909 f[3] = (zero ()) ? 'Z' : '-'; 4910 f[4] = (auxiliary_carry()) ? 'A' : '-'; 4911 f[5] = (parity ()) ? 'P' : '-'; 4912 f[6] = (carry ()) ? 'C' : '-'; 4913 f[7] = '\x0'; 4914 // output 4915 printf("%08x flags = %s", _value, f); 4916 } 4917 4918 }; 4919 4920 class IU_Register { 4921 public: 4922 int32_t _value; 4923 4924 void print() const { 4925 printf("%08x %11d", _value, _value); 4926 } 4927 4928 }; 4929 4930 class IU_State { 4931 public: 4932 Flag_Register _eflags; 4933 IU_Register _rdi; 4934 IU_Register _rsi; 4935 IU_Register _rbp; 4936 IU_Register _rsp; 4937 IU_Register _rbx; 4938 IU_Register _rdx; 4939 IU_Register _rcx; 4940 IU_Register _rax; 4941 4942 void print() const { 4943 // computation registers 4944 printf("rax, = "); _rax.print(); printf("\n"); 4945 printf("rbx, = "); _rbx.print(); printf("\n"); 4946 printf("rcx = "); _rcx.print(); printf("\n"); 4947 printf("rdx = "); _rdx.print(); printf("\n"); 4948 printf("rdi = "); _rdi.print(); printf("\n"); 4949 printf("rsi = "); _rsi.print(); printf("\n"); 4950 printf("rbp, = "); _rbp.print(); printf("\n"); 4951 printf("rsp = "); _rsp.print(); printf("\n"); 4952 printf("\n"); 4953 // control registers 4954 printf("flgs = "); _eflags.print(); printf("\n"); 4955 } 4956 }; 4957 4958 4959 class CPU_State { 4960 public: 4961 FPU_State _fpu_state; 4962 IU_State _iu_state; 4963 4964 void print() const { 4965 printf("--------------------------------------------------\n"); 4966 _iu_state .print(); 4967 printf("\n"); 4968 _fpu_state.print(); 4969 printf("--------------------------------------------------\n"); 4970 } 4971 4972 }; 4973 4974 4975 static void _print_CPU_state(CPU_State* state) { 4976 state->print(); 4977 }; 4978 4979 4980 void MacroAssembler::print_CPU_state() { 4981 push_CPU_state(); 4982 push(rsp); // pass CPU state 4983 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state))); 4984 addptr(rsp, wordSize); // discard argument 4985 pop_CPU_state(); 4986 } 4987 4988 4989 #ifndef _LP64 4990 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) { 4991 static int counter = 0; 4992 FPU_State* fs = &state->_fpu_state; 4993 counter++; 4994 // For leaf calls, only verify that the top few elements remain empty. 4995 // We only need 1 empty at the top for C2 code. 
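// (Tag encoding used by tag_for_st(): 0 = valid, 1 = zero, 2 = special, 3 = empty,
// matching FPU_State::tag_as_string above; so 'tag_for_st(7) != 3' asks whether the
// deepest x87 stack slot, ST(7), is still free.)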
4996 if( stack_depth < 0 ) { 4997 if( fs->tag_for_st(7) != 3 ) { 4998 printf("FPR7 not empty\n"); 4999 state->print(); 5000 assert(false, "error"); 5001 return false; 5002 } 5003 return true; // All other stack states do not matter 5004 } 5005 5006 assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(), 5007 "bad FPU control word"); 5008 5009 // compute stack depth 5010 int i = 0; 5011 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) < 3) i++; 5012 int d = i; 5013 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++; 5014 // verify findings 5015 if (i != FPU_State::number_of_registers) { 5016 // stack not contiguous 5017 printf("%s: stack not contiguous at ST%d\n", s, i); 5018 state->print(); 5019 assert(false, "error"); 5020 return false; 5021 } 5022 // check if computed stack depth corresponds to expected stack depth 5023 if (stack_depth < 0) { 5024 // expected stack depth is -stack_depth or less 5025 if (d > -stack_depth) { 5026 // too many elements on the stack 5027 printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d); 5028 state->print(); 5029 assert(false, "error"); 5030 return false; 5031 } 5032 } else { 5033 // expected stack depth is stack_depth 5034 if (d != stack_depth) { 5035 // wrong stack depth 5036 printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d); 5037 state->print(); 5038 assert(false, "error"); 5039 return false; 5040 } 5041 } 5042 // everything is cool 5043 return true; 5044 } 5045 5046 void MacroAssembler::verify_FPU(int stack_depth, const char* s) { 5047 if (!VerifyFPU) return; 5048 push_CPU_state(); 5049 push(rsp); // pass CPU state 5050 ExternalAddress msg((address) s); 5051 // pass message string s 5052 pushptr(msg.addr(), noreg); 5053 push(stack_depth); // pass stack depth 5054 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU))); 5055 addptr(rsp, 3 * wordSize); // discard arguments 5056 // check for error 5057 { Label L; 5058 testl(rax, rax); 5059 jcc(Assembler::notZero, L); 5060 int3(); // break if error condition 5061 bind(L); 5062 } 5063 pop_CPU_state(); 5064 } 5065 #endif // _LP64 5066 5067 void MacroAssembler::restore_cpu_control_state_after_jni(Register rscratch) { 5068 // Either restore the MXCSR register after returning from the JNI Call 5069 // or verify that it wasn't changed (with -Xcheck:jni flag). 5070 if (VM_Version::supports_sse()) { 5071 if (RestoreMXCSROnJNICalls) { 5072 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), rscratch); 5073 } else if (CheckJNICalls) { 5074 call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry())); 5075 } 5076 } 5077 // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty. 5078 vzeroupper(); 5079 5080 #ifndef _LP64 5081 // Either restore the x87 floating pointer control word after returning 5082 // from the JNI call or verify that it wasn't changed. 5083 if (CheckJNICalls) { 5084 call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry())); 5085 } 5086 #endif // _LP64 5087 } 5088 5089 // ((OopHandle)result).resolve(); 5090 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 5091 assert_different_registers(result, tmp); 5092 5093 // Only 64 bit platforms support GCs that require a tmp register 5094 // Only IN_HEAP loads require a thread_tmp register 5095 // OopHandle::resolve is an indirection like jobject. 
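// (In plain C++ terms the load below is roughly 'result = *(oop*)result', with the
// BarrierSetAssembler supplying whatever GC load barrier IN_NATIVE requires.)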
5096 access_load_at(T_OBJECT, IN_NATIVE, 5097 result, Address(result, 0), tmp, /*tmp_thread*/noreg); 5098 } 5099 5100 // ((WeakHandle)result).resolve(); 5101 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) { 5102 assert_different_registers(rresult, rtmp); 5103 Label resolved; 5104 5105 // A null weak handle resolves to null. 5106 cmpptr(rresult, 0); 5107 jcc(Assembler::equal, resolved); 5108 5109 // Only 64 bit platforms support GCs that require a tmp register 5110 // Only IN_HEAP loads require a thread_tmp register 5111 // WeakHandle::resolve is an indirection like jweak. 5112 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, 5113 rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg); 5114 bind(resolved); 5115 } 5116 5117 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) { 5118 // get mirror 5119 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 5120 load_method_holder(mirror, method); 5121 movptr(mirror, Address(mirror, mirror_offset)); 5122 resolve_oop_handle(mirror, tmp); 5123 } 5124 5125 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) { 5126 load_method_holder(rresult, rmethod); 5127 movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset())); 5128 } 5129 5130 void MacroAssembler::load_method_holder(Register holder, Register method) { 5131 movptr(holder, Address(method, Method::const_offset())); // ConstMethod* 5132 movptr(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool* 5133 movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass* 5134 } 5135 5136 #ifdef _LP64 5137 void MacroAssembler::load_nklass(Register dst, Register src) { 5138 assert(UseCompressedClassPointers, "expect compressed class pointers"); 5139 5140 if (!UseCompactObjectHeaders) { 5141 movl(dst, Address(src, oopDesc::klass_offset_in_bytes())); 5142 return; 5143 } 5144 5145 Label fast; 5146 movq(dst, Address(src, oopDesc::mark_offset_in_bytes())); 5147 testb(dst, markWord::monitor_value); 5148 jccb(Assembler::zero, fast); 5149 5150 // Fetch displaced header 5151 movq(dst, Address(dst, OM_OFFSET_NO_MONITOR_VALUE_TAG(header))); 5152 5153 bind(fast); 5154 shrq(dst, markWord::klass_shift); 5155 } 5156 #endif 5157 5158 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) { 5159 assert_different_registers(src, tmp); 5160 assert_different_registers(dst, tmp); 5161 #ifdef _LP64 5162 if (UseCompressedClassPointers) { 5163 load_nklass(dst, src); 5164 decode_klass_not_null(dst, tmp); 5165 } else 5166 #endif 5167 movptr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 5168 } 5169 5170 void MacroAssembler::load_klass_check_null(Register dst, Register src, Register tmp) { 5171 if (UseCompactObjectHeaders) { 5172 null_check(src, oopDesc::mark_offset_in_bytes()); 5173 } else { 5174 null_check(src, oopDesc::klass_offset_in_bytes()); 5175 } 5176 load_klass(dst, src, tmp); 5177 } 5178 5179 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) { 5180 assert(!UseCompactObjectHeaders, "not with compact headers"); 5181 assert_different_registers(src, tmp); 5182 assert_different_registers(dst, tmp); 5183 #ifdef _LP64 5184 if (UseCompressedClassPointers) { 5185 encode_klass_not_null(src, tmp); 5186 movl(Address(dst, oopDesc::klass_offset_in_bytes()), src); 5187 } else 5188 #endif 5189 movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src); 5190 } 5191 5192 void MacroAssembler::cmp_klass(Register klass, 
Register obj, Register tmp) { 5193 #ifdef _LP64 5194 if (UseCompactObjectHeaders) { 5195 // NOTE: We need to deal with possible ObjectMonitor in object header. 5196 // Eventually we might be able to do simple movl & cmpl like in 5197 // the CCP path below. 5198 load_nklass(tmp, obj); 5199 cmpl(klass, tmp); 5200 } else if (UseCompressedClassPointers) { 5201 cmpl(klass, Address(obj, oopDesc::klass_offset_in_bytes())); 5202 } else 5203 #endif 5204 { 5205 cmpptr(klass, Address(obj, oopDesc::klass_offset_in_bytes())); 5206 } 5207 } 5208 5209 void MacroAssembler::cmp_klass(Register src, Register dst, Register tmp1, Register tmp2) { 5210 #ifdef _LP64 5211 if (UseCompactObjectHeaders) { 5212 // NOTE: We need to deal with possible ObjectMonitor in object header. 5213 // Eventually we might be able to do simple movl & cmpl like in 5214 // the CCP path below. 5215 assert(tmp2 != noreg, "need tmp2"); 5216 assert_different_registers(src, dst, tmp1, tmp2); 5217 load_nklass(tmp1, src); 5218 load_nklass(tmp2, dst); 5219 cmpl(tmp1, tmp2); 5220 } else if (UseCompressedClassPointers) { 5221 movl(tmp1, Address(src, oopDesc::klass_offset_in_bytes())); 5222 cmpl(tmp1, Address(dst, oopDesc::klass_offset_in_bytes())); 5223 } else 5224 #endif 5225 { 5226 movptr(tmp1, Address(src, oopDesc::klass_offset_in_bytes())); 5227 cmpptr(tmp1, Address(dst, oopDesc::klass_offset_in_bytes())); 5228 } 5229 } 5230 5231 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src, 5232 Register tmp1, Register thread_tmp) { 5233 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 5234 decorators = AccessInternal::decorator_fixup(decorators, type); 5235 bool as_raw = (decorators & AS_RAW) != 0; 5236 if (as_raw) { 5237 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 5238 } else { 5239 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 5240 } 5241 } 5242 5243 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val, 5244 Register tmp1, Register tmp2, Register tmp3) { 5245 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 5246 decorators = AccessInternal::decorator_fixup(decorators, type); 5247 bool as_raw = (decorators & AS_RAW) != 0; 5248 if (as_raw) { 5249 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3); 5250 } else { 5251 bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3); 5252 } 5253 } 5254 5255 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, 5256 Register thread_tmp, DecoratorSet decorators) { 5257 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 5258 } 5259 5260 // Doesn't do verification, generates fixed size code 5261 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, 5262 Register thread_tmp, DecoratorSet decorators) { 5263 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp); 5264 } 5265 5266 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1, 5267 Register tmp2, Register tmp3, DecoratorSet decorators) { 5268 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3); 5269 } 5270 5271 // Used for storing NULLs. 
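// Illustrative note, not extra generated code: passing noreg as the value
// register below is the conventional way to request a null store from the
// barrier assembler; with the plain BarrierSetAssembler this typically
// reduces to storing a 32-bit zero (compressed oops) or a pointer-sized
// zero at dst, and GC-specific barriers may wrap it with their own code.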
5272 void MacroAssembler::store_heap_oop_null(Address dst) { 5273 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg); 5274 } 5275 5276 #ifdef _LP64 5277 void MacroAssembler::store_klass_gap(Register dst, Register src) { 5278 if (UseCompressedClassPointers) { 5279 // Store to klass gap in destination 5280 movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src); 5281 } 5282 } 5283 5284 #ifdef ASSERT 5285 void MacroAssembler::verify_heapbase(const char* msg) { 5286 assert (UseCompressedOops, "should be compressed"); 5287 assert (Universe::heap() != NULL, "java heap should be initialized"); 5288 if (CheckCompressedOops) { 5289 Label ok; 5290 ExternalAddress src2(CompressedOops::ptrs_base_addr()); 5291 const bool is_src2_reachable = reachable(src2); 5292 if (!is_src2_reachable) { 5293 push(rscratch1); // cmpptr trashes rscratch1 5294 } 5295 cmpptr(r12_heapbase, src2, rscratch1); 5296 jcc(Assembler::equal, ok); 5297 STOP(msg); 5298 bind(ok); 5299 if (!is_src2_reachable) { 5300 pop(rscratch1); 5301 } 5302 } 5303 } 5304 #endif 5305 5306 // Algorithm must match oop.inline.hpp encode_heap_oop. 5307 void MacroAssembler::encode_heap_oop(Register r) { 5308 #ifdef ASSERT 5309 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?"); 5310 #endif 5311 verify_oop_msg(r, "broken oop in encode_heap_oop"); 5312 if (CompressedOops::base() == NULL) { 5313 if (CompressedOops::shift() != 0) { 5314 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 5315 shrq(r, LogMinObjAlignmentInBytes); 5316 } 5317 return; 5318 } 5319 testq(r, r); 5320 cmovq(Assembler::equal, r, r12_heapbase); 5321 subq(r, r12_heapbase); 5322 shrq(r, LogMinObjAlignmentInBytes); 5323 } 5324 5325 void MacroAssembler::encode_heap_oop_not_null(Register r) { 5326 #ifdef ASSERT 5327 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 5328 if (CheckCompressedOops) { 5329 Label ok; 5330 testq(r, r); 5331 jcc(Assembler::notEqual, ok); 5332 STOP("null oop passed to encode_heap_oop_not_null"); 5333 bind(ok); 5334 } 5335 #endif 5336 verify_oop_msg(r, "broken oop in encode_heap_oop_not_null"); 5337 if (CompressedOops::base() != NULL) { 5338 subq(r, r12_heapbase); 5339 } 5340 if (CompressedOops::shift() != 0) { 5341 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 5342 shrq(r, LogMinObjAlignmentInBytes); 5343 } 5344 } 5345 5346 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 5347 #ifdef ASSERT 5348 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 5349 if (CheckCompressedOops) { 5350 Label ok; 5351 testq(src, src); 5352 jcc(Assembler::notEqual, ok); 5353 STOP("null oop passed to encode_heap_oop_not_null2"); 5354 bind(ok); 5355 } 5356 #endif 5357 verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2"); 5358 if (dst != src) { 5359 movq(dst, src); 5360 } 5361 if (CompressedOops::base() != NULL) { 5362 subq(dst, r12_heapbase); 5363 } 5364 if (CompressedOops::shift() != 0) { 5365 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 5366 shrq(dst, LogMinObjAlignmentInBytes); 5367 } 5368 } 5369 5370 void MacroAssembler::decode_heap_oop(Register r) { 5371 #ifdef ASSERT 5372 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 5373 #endif 5374 if (CompressedOops::base() == NULL) { 5375 if (CompressedOops::shift() != 0) { 5376 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 5377 
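    // With a zero heap base, decoding reduces to undoing the compression
    // shift (a sketch of the transformation, not additional code):
    //   oop = (uintptr_t)narrow_oop << LogMinObjAlignmentInBytes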
shlq(r, LogMinObjAlignmentInBytes); 5378 } 5379 } else { 5380 Label done; 5381 shlq(r, LogMinObjAlignmentInBytes); 5382 jccb(Assembler::equal, done); 5383 addq(r, r12_heapbase); 5384 bind(done); 5385 } 5386 verify_oop_msg(r, "broken oop in decode_heap_oop"); 5387 } 5388 5389 void MacroAssembler::decode_heap_oop_not_null(Register r) { 5390 // Note: it will change flags 5391 assert (UseCompressedOops, "should only be used for compressed headers"); 5392 assert (Universe::heap() != NULL, "java heap should be initialized"); 5393 // Cannot assert, unverified entry point counts instructions (see .ad file) 5394 // vtableStubs also counts instructions in pd_code_size_limit. 5395 // Also do not verify_oop as this is called by verify_oop. 5396 if (CompressedOops::shift() != 0) { 5397 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 5398 shlq(r, LogMinObjAlignmentInBytes); 5399 if (CompressedOops::base() != NULL) { 5400 addq(r, r12_heapbase); 5401 } 5402 } else { 5403 assert (CompressedOops::base() == NULL, "sanity"); 5404 } 5405 } 5406 5407 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 5408 // Note: it will change flags 5409 assert (UseCompressedOops, "should only be used for compressed headers"); 5410 assert (Universe::heap() != NULL, "java heap should be initialized"); 5411 // Cannot assert, unverified entry point counts instructions (see .ad file) 5412 // vtableStubs also counts instructions in pd_code_size_limit. 5413 // Also do not verify_oop as this is called by verify_oop. 5414 if (CompressedOops::shift() != 0) { 5415 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 5416 if (LogMinObjAlignmentInBytes == Address::times_8) { 5417 leaq(dst, Address(r12_heapbase, src, Address::times_8, 0)); 5418 } else { 5419 if (dst != src) { 5420 movq(dst, src); 5421 } 5422 shlq(dst, LogMinObjAlignmentInBytes); 5423 if (CompressedOops::base() != NULL) { 5424 addq(dst, r12_heapbase); 5425 } 5426 } 5427 } else { 5428 assert (CompressedOops::base() == NULL, "sanity"); 5429 if (dst != src) { 5430 movq(dst, src); 5431 } 5432 } 5433 } 5434 5435 MacroAssembler::KlassDecodeMode MacroAssembler::_klass_decode_mode = KlassDecodeNone; 5436 5437 // Returns a static string 5438 const char* MacroAssembler::describe_klass_decode_mode(MacroAssembler::KlassDecodeMode mode) { 5439 switch (mode) { 5440 case KlassDecodeNone: return "none"; 5441 case KlassDecodeZero: return "zero"; 5442 case KlassDecodeXor: return "xor"; 5443 case KlassDecodeAdd: return "add"; 5444 default: 5445 ShouldNotReachHere(); 5446 } 5447 return NULL; 5448 } 5449 5450 // Return the current narrow Klass pointer decode mode. 5451 MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode() { 5452 if (_klass_decode_mode == KlassDecodeNone) { 5453 // First time initialization 5454 assert(UseCompressedClassPointers, "not using compressed class pointers"); 5455 assert(Metaspace::initialized(), "metaspace not initialized yet"); 5456 5457 _klass_decode_mode = klass_decode_mode_for_base(CompressedKlassPointers::base()); 5458 guarantee(_klass_decode_mode != KlassDecodeNone, 5459 PTR_FORMAT " is not a valid encoding base on x86", 5460 p2i(CompressedKlassPointers::base())); 5461 log_info(metaspace)("klass decode mode initialized: %s", describe_klass_decode_mode(_klass_decode_mode)); 5462 } 5463 return _klass_decode_mode; 5464 } 5465 5466 // Given an arbitrary base address, return the KlassDecodeMode that would be used.
Return KlassDecodeNone 5467 // if base address is not valid for encoding. 5468 MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode_for_base(address base) { 5469 assert(CompressedKlassPointers::shift() != 0, "not lilliput?"); 5470 5471 const uint64_t base_u64 = (uint64_t) base; 5472 5473 if (base_u64 == 0) { 5474 return KlassDecodeZero; 5475 } 5476 5477 if ((base_u64 & (KlassEncodingMetaspaceMax - 1)) == 0) { 5478 return KlassDecodeXor; 5479 } 5480 5481 // Note that there is no point in optimizing for shift=3 since lilliput 5482 // will use larger shifts 5483 5484 // The add+shift mode for decode_and_move_klass_not_null() requires the base to be 5485 // shiftable-without-loss. So, this is the minimum restriction on x64 for a valid 5486 // encoding base. This does not matter in reality since the shift values we use for 5487 // Lilliput, while large, won't be larger than a page size. And the encoding base 5488 // will be quite likely page aligned since it usually falls to the beginning of 5489 // either CDS or CCS. 5490 if ((base_u64 & (KlassAlignmentInBytes - 1)) == 0) { 5491 return KlassDecodeAdd; 5492 } 5493 5494 return KlassDecodeNone; 5495 } 5496 5497 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) { 5498 assert_different_registers(r, tmp); 5499 switch (klass_decode_mode()) { 5500 case KlassDecodeZero: { 5501 shrq(r, CompressedKlassPointers::shift()); 5502 break; 5503 } 5504 case KlassDecodeXor: { 5505 mov64(tmp, (int64_t)CompressedKlassPointers::base()); 5506 xorq(r, tmp); 5507 shrq(r, CompressedKlassPointers::shift()); 5508 break; 5509 } 5510 case KlassDecodeAdd: { 5511 mov64(tmp, (int64_t)CompressedKlassPointers::base()); 5512 subq(r, tmp); 5513 shrq(r, CompressedKlassPointers::shift()); 5514 break; 5515 } 5516 default: 5517 ShouldNotReachHere(); 5518 } 5519 } 5520 5521 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) { 5522 assert_different_registers(src, dst); 5523 switch (klass_decode_mode()) { 5524 case KlassDecodeZero: { 5525 movptr(dst, src); 5526 shrq(dst, CompressedKlassPointers::shift()); 5527 break; 5528 } 5529 case KlassDecodeXor: { 5530 mov64(dst, (int64_t)CompressedKlassPointers::base()); 5531 xorq(dst, src); 5532 shrq(dst, CompressedKlassPointers::shift()); 5533 break; 5534 } 5535 case KlassDecodeAdd: { 5536 mov64(dst, -(int64_t)CompressedKlassPointers::base()); 5537 addq(dst, src); 5538 shrq(dst, CompressedKlassPointers::shift()); 5539 break; 5540 } 5541 default: 5542 ShouldNotReachHere(); 5543 } 5544 } 5545 5546 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) { 5547 assert_different_registers(r, tmp); 5548 const uint64_t base_u64 = (uint64_t)CompressedKlassPointers::base(); 5549 switch (klass_decode_mode()) { 5550 case KlassDecodeZero: { 5551 shlq(r, CompressedKlassPointers::shift()); 5552 break; 5553 } 5554 case KlassDecodeXor: { 5555 assert((base_u64 & (KlassEncodingMetaspaceMax - 1)) == 0, 5556 "base " UINT64_FORMAT_X " invalid for xor mode", base_u64); // should have been handled at VM init. 
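    // Why xor decodes correctly (illustrative, mirrors the encode side): the
    // base is aligned to KlassEncodingMetaspaceMax, so its low bits are zero,
    // while (narrow_klass << shift) stays below KlassEncodingMetaspaceMax for
    // any narrow klass produced by the matching encode. The two operands
    // occupy disjoint bit ranges, hence
    //   (narrow_klass << shift) ^ base == (narrow_klass << shift) + base.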
5557 shlq(r, CompressedKlassPointers::shift()); 5558 mov64(tmp, base_u64); 5559 xorq(r, tmp); 5560 break; 5561 } 5562 case KlassDecodeAdd: { 5563 shlq(r, CompressedKlassPointers::shift()); 5564 mov64(tmp, base_u64); 5565 addq(r, tmp); 5566 break; 5567 } 5568 default: 5569 ShouldNotReachHere(); 5570 } 5571 } 5572 5573 void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) { 5574 assert_different_registers(src, dst); 5575 // Note: Cannot assert, unverified entry point counts instructions (see .ad file) 5576 // vtableStubs also counts instructions in pd_code_size_limit. 5577 // Also do not verify_oop as this is called by verify_oop. 5578 5579 const uint64_t base_u64 = (uint64_t)CompressedKlassPointers::base(); 5580 5581 switch (klass_decode_mode()) { 5582 case KlassDecodeZero: { 5583 movq(dst, src); 5584 shlq(dst, CompressedKlassPointers::shift()); 5585 break; 5586 } 5587 case KlassDecodeXor: { 5588 assert((base_u64 & (KlassEncodingMetaspaceMax - 1)) == 0, 5589 "base " UINT64_FORMAT_X " invalid for xor mode", base_u64); // should have been handled at VM init. 5590 const uint64_t base_right_shifted = base_u64 >> CompressedKlassPointers::shift(); 5591 mov64(dst, base_right_shifted); 5592 xorq(dst, src); 5593 shlq(dst, CompressedKlassPointers::shift()); 5594 break; 5595 } 5596 case KlassDecodeAdd: { 5597 assert((base_u64 & (KlassAlignmentInBytes - 1)) == 0, 5598 "base " UINT64_FORMAT_X " invalid for add mode", base_u64); // should have been handled at VM init. 5599 const uint64_t base_right_shifted = base_u64 >> CompressedKlassPointers::shift(); 5600 mov64(dst, base_right_shifted); 5601 addq(dst, src); 5602 shlq(dst, CompressedKlassPointers::shift()); 5603 break; 5604 } 5605 default: 5606 ShouldNotReachHere(); 5607 } 5608 } 5609 5610 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 5611 assert (UseCompressedOops, "should only be used for compressed headers"); 5612 assert (Universe::heap() != NULL, "java heap should be initialized"); 5613 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5614 int oop_index = oop_recorder()->find_index(obj); 5615 RelocationHolder rspec = oop_Relocation::spec(oop_index); 5616 mov_narrow_oop(dst, oop_index, rspec); 5617 } 5618 5619 void MacroAssembler::set_narrow_oop(Address dst, jobject obj) { 5620 assert (UseCompressedOops, "should only be used for compressed headers"); 5621 assert (Universe::heap() != NULL, "java heap should be initialized"); 5622 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5623 int oop_index = oop_recorder()->find_index(obj); 5624 RelocationHolder rspec = oop_Relocation::spec(oop_index); 5625 mov_narrow_oop(dst, oop_index, rspec); 5626 } 5627 5628 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 5629 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 5630 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5631 int klass_index = oop_recorder()->find_index(k); 5632 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 5633 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 5634 } 5635 5636 void MacroAssembler::set_narrow_klass(Address dst, Klass* k) { 5637 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 5638 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5639 int klass_index = oop_recorder()->find_index(k); 5640 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 5641 mov_narrow_oop(dst, 
CompressedKlassPointers::encode(k), rspec); 5642 } 5643 5644 void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) { 5645 assert (UseCompressedOops, "should only be used for compressed headers"); 5646 assert (Universe::heap() != NULL, "java heap should be initialized"); 5647 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5648 int oop_index = oop_recorder()->find_index(obj); 5649 RelocationHolder rspec = oop_Relocation::spec(oop_index); 5650 Assembler::cmp_narrow_oop(dst, oop_index, rspec); 5651 } 5652 5653 void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) { 5654 assert (UseCompressedOops, "should only be used for compressed headers"); 5655 assert (Universe::heap() != NULL, "java heap should be initialized"); 5656 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5657 int oop_index = oop_recorder()->find_index(obj); 5658 RelocationHolder rspec = oop_Relocation::spec(oop_index); 5659 Assembler::cmp_narrow_oop(dst, oop_index, rspec); 5660 } 5661 5662 void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) { 5663 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 5664 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5665 int klass_index = oop_recorder()->find_index(k); 5666 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 5667 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 5668 } 5669 5670 void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) { 5671 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 5672 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 5673 int klass_index = oop_recorder()->find_index(k); 5674 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 5675 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 5676 } 5677 5678 void MacroAssembler::reinit_heapbase() { 5679 if (UseCompressedOops) { 5680 if (Universe::heap() != NULL) { 5681 if (CompressedOops::base() == NULL) { 5682 MacroAssembler::xorptr(r12_heapbase, r12_heapbase); 5683 } else { 5684 mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base()); 5685 } 5686 } else { 5687 movptr(r12_heapbase, ExternalAddress(CompressedOops::ptrs_base_addr())); 5688 } 5689 } 5690 } 5691 5692 #endif // _LP64 5693 5694 #if COMPILER2_OR_JVMCI 5695 5696 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers 5697 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) { 5698 // cnt - number of qwords (8-byte words). 5699 // base - start address, qword aligned. 
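  //
  // Rough shape of the emitted code (illustrative only, not extra code):
  //
  //   xtmp = 0;
  //   while (cnt >= 8) { store 64 zero bytes at base; base += 64; cnt -= 8; }
  //   // 0..7 qwords remain: finished with a masked 64-byte store when
  //   // 64-byte vectors are in use, otherwise with 32/16-byte stores plus a
  //   // scalar qword loop.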
5700 Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end; 5701 bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0); 5702 if (use64byteVector) { 5703 vpxor(xtmp, xtmp, xtmp, AVX_512bit); 5704 } else if (MaxVectorSize >= 32) { 5705 vpxor(xtmp, xtmp, xtmp, AVX_256bit); 5706 } else { 5707 pxor(xtmp, xtmp); 5708 } 5709 jmp(L_zero_64_bytes); 5710 5711 BIND(L_loop); 5712 if (MaxVectorSize >= 32) { 5713 fill64(base, 0, xtmp, use64byteVector); 5714 } else { 5715 movdqu(Address(base, 0), xtmp); 5716 movdqu(Address(base, 16), xtmp); 5717 movdqu(Address(base, 32), xtmp); 5718 movdqu(Address(base, 48), xtmp); 5719 } 5720 addptr(base, 64); 5721 5722 BIND(L_zero_64_bytes); 5723 subptr(cnt, 8); 5724 jccb(Assembler::greaterEqual, L_loop); 5725 5726 // Copy trailing 64 bytes 5727 if (use64byteVector) { 5728 addptr(cnt, 8); 5729 jccb(Assembler::equal, L_end); 5730 fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true); 5731 jmp(L_end); 5732 } else { 5733 addptr(cnt, 4); 5734 jccb(Assembler::less, L_tail); 5735 if (MaxVectorSize >= 32) { 5736 vmovdqu(Address(base, 0), xtmp); 5737 } else { 5738 movdqu(Address(base, 0), xtmp); 5739 movdqu(Address(base, 16), xtmp); 5740 } 5741 } 5742 addptr(base, 32); 5743 subptr(cnt, 4); 5744 5745 BIND(L_tail); 5746 addptr(cnt, 4); 5747 jccb(Assembler::lessEqual, L_end); 5748 if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) { 5749 fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp); 5750 } else { 5751 decrement(cnt); 5752 5753 BIND(L_sloop); 5754 movq(Address(base, 0), xtmp); 5755 addptr(base, 8); 5756 decrement(cnt); 5757 jccb(Assembler::greaterEqual, L_sloop); 5758 } 5759 BIND(L_end); 5760 } 5761 5762 // Clearing constant sized memory using YMM/ZMM registers. 5763 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) { 5764 assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), ""); 5765 bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0); 5766 5767 int vector64_count = (cnt & (~0x7)) >> 3; 5768 cnt = cnt & 0x7; 5769 const int fill64_per_loop = 4; 5770 const int max_unrolled_fill64 = 8; 5771 5772 // 64 byte initialization loop. 5773 vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit); 5774 int start64 = 0; 5775 if (vector64_count > max_unrolled_fill64) { 5776 Label LOOP; 5777 Register index = rtmp; 5778 5779 start64 = vector64_count - (vector64_count % fill64_per_loop); 5780 5781 movl(index, 0); 5782 BIND(LOOP); 5783 for (int i = 0; i < fill64_per_loop; i++) { 5784 fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector); 5785 } 5786 addl(index, fill64_per_loop * 64); 5787 cmpl(index, start64 * 64); 5788 jccb(Assembler::less, LOOP); 5789 } 5790 for (int i = start64; i < vector64_count; i++) { 5791 fill64(base, i * 64, xtmp, use64byteVector); 5792 } 5793 5794 // Clear remaining 64 byte tail. 
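  // At this point cnt holds the number of trailing qwords (0..7). The switch
  // below stores them with the narrowest suitable moves; where no plain store
  // matches, it builds a qword mask first (0x7 selects 3 qwords, 0x1F five,
  // 0x3F six, 0x7F seven) and issues a masked store.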
5795 int disp = vector64_count * 64; 5796 if (cnt) { 5797 switch (cnt) { 5798 case 1: 5799 movq(Address(base, disp), xtmp); 5800 break; 5801 case 2: 5802 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit); 5803 break; 5804 case 3: 5805 movl(rtmp, 0x7); 5806 kmovwl(mask, rtmp); 5807 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit); 5808 break; 5809 case 4: 5810 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit); 5811 break; 5812 case 5: 5813 if (use64byteVector) { 5814 movl(rtmp, 0x1F); 5815 kmovwl(mask, rtmp); 5816 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit); 5817 } else { 5818 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit); 5819 movq(Address(base, disp + 32), xtmp); 5820 } 5821 break; 5822 case 6: 5823 if (use64byteVector) { 5824 movl(rtmp, 0x3F); 5825 kmovwl(mask, rtmp); 5826 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit); 5827 } else { 5828 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit); 5829 evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit); 5830 } 5831 break; 5832 case 7: 5833 if (use64byteVector) { 5834 movl(rtmp, 0x7F); 5835 kmovwl(mask, rtmp); 5836 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit); 5837 } else { 5838 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit); 5839 movl(rtmp, 0x7); 5840 kmovwl(mask, rtmp); 5841 evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit); 5842 } 5843 break; 5844 default: 5845 fatal("Unexpected length : %d\n",cnt); 5846 break; 5847 } 5848 } 5849 } 5850 5851 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, 5852 bool is_large, KRegister mask) { 5853 // cnt - number of qwords (8-byte words). 5854 // base - start address, qword aligned. 
5855 // is_large - if optimizers know cnt is larger than InitArrayShortSize 5856 assert(base==rdi, "base register must be edi for rep stos"); 5857 assert(tmp==rax, "tmp register must be eax for rep stos"); 5858 assert(cnt==rcx, "cnt register must be ecx for rep stos"); 5859 assert(InitArrayShortSize % BytesPerLong == 0, 5860 "InitArrayShortSize should be the multiple of BytesPerLong"); 5861 5862 Label DONE; 5863 if (!is_large || !UseXMMForObjInit) { 5864 xorptr(tmp, tmp); 5865 } 5866 5867 if (!is_large) { 5868 Label LOOP, LONG; 5869 cmpptr(cnt, InitArrayShortSize/BytesPerLong); 5870 jccb(Assembler::greater, LONG); 5871 5872 NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM 5873 5874 decrement(cnt); 5875 jccb(Assembler::negative, DONE); // Zero length 5876 5877 // Use individual pointer-sized stores for small counts: 5878 BIND(LOOP); 5879 movptr(Address(base, cnt, Address::times_ptr), tmp); 5880 decrement(cnt); 5881 jccb(Assembler::greaterEqual, LOOP); 5882 jmpb(DONE); 5883 5884 BIND(LONG); 5885 } 5886 5887 // Use longer rep-prefixed ops for non-small counts: 5888 if (UseFastStosb) { 5889 shlptr(cnt, 3); // convert to number of bytes 5890 rep_stosb(); 5891 } else if (UseXMMForObjInit) { 5892 xmm_clear_mem(base, cnt, tmp, xtmp, mask); 5893 } else { 5894 NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM 5895 rep_stos(); 5896 } 5897 5898 BIND(DONE); 5899 } 5900 5901 #endif //COMPILER2_OR_JVMCI 5902 5903 5904 void MacroAssembler::generate_fill(BasicType t, bool aligned, 5905 Register to, Register value, Register count, 5906 Register rtmp, XMMRegister xtmp) { 5907 ShortBranchVerifier sbv(this); 5908 assert_different_registers(to, value, count, rtmp); 5909 Label L_exit; 5910 Label L_fill_2_bytes, L_fill_4_bytes; 5911 5912 #if defined(COMPILER2) && defined(_LP64) 5913 if(MaxVectorSize >=32 && 5914 VM_Version::supports_avx512vlbw() && 5915 VM_Version::supports_bmi2()) { 5916 generate_fill_avx3(t, to, value, count, rtmp, xtmp); 5917 return; 5918 } 5919 #endif 5920 5921 int shift = -1; 5922 switch (t) { 5923 case T_BYTE: 5924 shift = 2; 5925 break; 5926 case T_SHORT: 5927 shift = 1; 5928 break; 5929 case T_INT: 5930 shift = 0; 5931 break; 5932 default: ShouldNotReachHere(); 5933 } 5934 5935 if (t == T_BYTE) { 5936 andl(value, 0xff); 5937 movl(rtmp, value); 5938 shll(rtmp, 8); 5939 orl(value, rtmp); 5940 } 5941 if (t == T_SHORT) { 5942 andl(value, 0xffff); 5943 } 5944 if (t == T_BYTE || t == T_SHORT) { 5945 movl(rtmp, value); 5946 shll(rtmp, 16); 5947 orl(value, rtmp); 5948 } 5949 5950 cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element 5951 jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp 5952 if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) { 5953 Label L_skip_align2; 5954 // align source address at 4 bytes address boundary 5955 if (t == T_BYTE) { 5956 Label L_skip_align1; 5957 // One byte misalignment happens only for byte arrays 5958 testptr(to, 1); 5959 jccb(Assembler::zero, L_skip_align1); 5960 movb(Address(to, 0), value); 5961 increment(to); 5962 decrement(count); 5963 BIND(L_skip_align1); 5964 } 5965 // Two bytes misalignment happens only for byte and short (char) arrays 5966 testptr(to, 2); 5967 jccb(Assembler::zero, L_skip_align2); 5968 movw(Address(to, 0), value); 5969 addptr(to, 2); 5970 subl(count, 1<<(shift-1)); 5971 BIND(L_skip_align2); 5972 } 5973 if (UseSSE < 2) { 5974 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; 5975 // Fill 32-byte chunks 
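    // count is in elements, so "8 << shift" is one 32-byte chunk expressed in
    // elements (32 bytes, 16 shorts or 8 ints); at this point value has
    // already been replicated so that each 32-bit store writes 4/2/1 copies
    // of the fill value.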
5976 subl(count, 8 << shift); 5977 jcc(Assembler::less, L_check_fill_8_bytes); 5978 align(16); 5979 5980 BIND(L_fill_32_bytes_loop); 5981 5982 for (int i = 0; i < 32; i += 4) { 5983 movl(Address(to, i), value); 5984 } 5985 5986 addptr(to, 32); 5987 subl(count, 8 << shift); 5988 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); 5989 BIND(L_check_fill_8_bytes); 5990 addl(count, 8 << shift); 5991 jccb(Assembler::zero, L_exit); 5992 jmpb(L_fill_8_bytes); 5993 5994 // 5995 // length is too short, just fill qwords 5996 // 5997 BIND(L_fill_8_bytes_loop); 5998 movl(Address(to, 0), value); 5999 movl(Address(to, 4), value); 6000 addptr(to, 8); 6001 BIND(L_fill_8_bytes); 6002 subl(count, 1 << (shift + 1)); 6003 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); 6004 // fall through to fill 4 bytes 6005 } else { 6006 Label L_fill_32_bytes; 6007 if (!UseUnalignedLoadStores) { 6008 // align to 8 bytes, we know we are 4 byte aligned to start 6009 testptr(to, 4); 6010 jccb(Assembler::zero, L_fill_32_bytes); 6011 movl(Address(to, 0), value); 6012 addptr(to, 4); 6013 subl(count, 1<<shift); 6014 } 6015 BIND(L_fill_32_bytes); 6016 { 6017 assert( UseSSE >= 2, "supported cpu only" ); 6018 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; 6019 movdl(xtmp, value); 6020 if (UseAVX >= 2 && UseUnalignedLoadStores) { 6021 Label L_check_fill_32_bytes; 6022 if (UseAVX > 2) { 6023 // Fill 64-byte chunks 6024 Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2; 6025 6026 // If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2 6027 cmpl(count, VM_Version::avx3_threshold()); 6028 jccb(Assembler::below, L_check_fill_64_bytes_avx2); 6029 6030 vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit); 6031 6032 subl(count, 16 << shift); 6033 jccb(Assembler::less, L_check_fill_32_bytes); 6034 align(16); 6035 6036 BIND(L_fill_64_bytes_loop_avx3); 6037 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit); 6038 addptr(to, 64); 6039 subl(count, 16 << shift); 6040 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3); 6041 jmpb(L_check_fill_32_bytes); 6042 6043 BIND(L_check_fill_64_bytes_avx2); 6044 } 6045 // Fill 64-byte chunks 6046 Label L_fill_64_bytes_loop; 6047 vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit); 6048 6049 subl(count, 16 << shift); 6050 jcc(Assembler::less, L_check_fill_32_bytes); 6051 align(16); 6052 6053 BIND(L_fill_64_bytes_loop); 6054 vmovdqu(Address(to, 0), xtmp); 6055 vmovdqu(Address(to, 32), xtmp); 6056 addptr(to, 64); 6057 subl(count, 16 << shift); 6058 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop); 6059 6060 BIND(L_check_fill_32_bytes); 6061 addl(count, 8 << shift); 6062 jccb(Assembler::less, L_check_fill_8_bytes); 6063 vmovdqu(Address(to, 0), xtmp); 6064 addptr(to, 32); 6065 subl(count, 8 << shift); 6066 6067 BIND(L_check_fill_8_bytes); 6068 // clean upper bits of YMM registers 6069 movdl(xtmp, value); 6070 pshufd(xtmp, xtmp, 0); 6071 } else { 6072 // Fill 32-byte chunks 6073 pshufd(xtmp, xtmp, 0); 6074 6075 subl(count, 8 << shift); 6076 jcc(Assembler::less, L_check_fill_8_bytes); 6077 align(16); 6078 6079 BIND(L_fill_32_bytes_loop); 6080 6081 if (UseUnalignedLoadStores) { 6082 movdqu(Address(to, 0), xtmp); 6083 movdqu(Address(to, 16), xtmp); 6084 } else { 6085 movq(Address(to, 0), xtmp); 6086 movq(Address(to, 8), xtmp); 6087 movq(Address(to, 16), xtmp); 6088 movq(Address(to, 24), xtmp); 6089 } 6090 6091 addptr(to, 32); 6092 subl(count, 8 << shift); 6093 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); 6094 6095 
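    // Fewer than one 32-byte chunk remains; the code below adds the chunk
    // size back to count and finishes with 8-byte stores (and then the
    // trailing 4/2/1-byte stores shared by all paths).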
BIND(L_check_fill_8_bytes); 6096 } 6097 addl(count, 8 << shift); 6098 jccb(Assembler::zero, L_exit); 6099 jmpb(L_fill_8_bytes); 6100 6101 // 6102 // length is too short, just fill qwords 6103 // 6104 BIND(L_fill_8_bytes_loop); 6105 movq(Address(to, 0), xtmp); 6106 addptr(to, 8); 6107 BIND(L_fill_8_bytes); 6108 subl(count, 1 << (shift + 1)); 6109 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); 6110 } 6111 } 6112 // fill trailing 4 bytes 6113 BIND(L_fill_4_bytes); 6114 testl(count, 1<<shift); 6115 jccb(Assembler::zero, L_fill_2_bytes); 6116 movl(Address(to, 0), value); 6117 if (t == T_BYTE || t == T_SHORT) { 6118 Label L_fill_byte; 6119 addptr(to, 4); 6120 BIND(L_fill_2_bytes); 6121 // fill trailing 2 bytes 6122 testl(count, 1<<(shift-1)); 6123 jccb(Assembler::zero, L_fill_byte); 6124 movw(Address(to, 0), value); 6125 if (t == T_BYTE) { 6126 addptr(to, 2); 6127 BIND(L_fill_byte); 6128 // fill trailing byte 6129 testl(count, 1); 6130 jccb(Assembler::zero, L_exit); 6131 movb(Address(to, 0), value); 6132 } else { 6133 BIND(L_fill_byte); 6134 } 6135 } else { 6136 BIND(L_fill_2_bytes); 6137 } 6138 BIND(L_exit); 6139 } 6140 6141 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) { 6142 switch(type) { 6143 case T_BYTE: 6144 case T_BOOLEAN: 6145 evpbroadcastb(dst, src, vector_len); 6146 break; 6147 case T_SHORT: 6148 case T_CHAR: 6149 evpbroadcastw(dst, src, vector_len); 6150 break; 6151 case T_INT: 6152 case T_FLOAT: 6153 evpbroadcastd(dst, src, vector_len); 6154 break; 6155 case T_LONG: 6156 case T_DOUBLE: 6157 evpbroadcastq(dst, src, vector_len); 6158 break; 6159 default: 6160 fatal("Unhandled type : %s", type2name(type)); 6161 break; 6162 } 6163 } 6164 6165 // encode char[] to byte[] in ISO_8859_1 or ASCII 6166 //@IntrinsicCandidate 6167 //private static int implEncodeISOArray(byte[] sa, int sp, 6168 //byte[] da, int dp, int len) { 6169 // int i = 0; 6170 // for (; i < len; i++) { 6171 // char c = StringUTF16.getChar(sa, sp++); 6172 // if (c > '\u00FF') 6173 // break; 6174 // da[dp++] = (byte)c; 6175 // } 6176 // return i; 6177 //} 6178 // 6179 //@IntrinsicCandidate 6180 //private static int implEncodeAsciiArray(char[] sa, int sp, 6181 // byte[] da, int dp, int len) { 6182 // int i = 0; 6183 // for (; i < len; i++) { 6184 // char c = sa[sp++]; 6185 // if (c >= '\u0080') 6186 // break; 6187 // da[dp++] = (byte)c; 6188 // } 6189 // return i; 6190 //} 6191 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len, 6192 XMMRegister tmp1Reg, XMMRegister tmp2Reg, 6193 XMMRegister tmp3Reg, XMMRegister tmp4Reg, 6194 Register tmp5, Register result, bool ascii) { 6195 6196 // rsi: src 6197 // rdi: dst 6198 // rdx: len 6199 // rcx: tmp5 6200 // rax: result 6201 ShortBranchVerifier sbv(this); 6202 assert_different_registers(src, dst, len, tmp5, result); 6203 Label L_done, L_copy_1_char, L_copy_1_char_exit; 6204 6205 int mask = ascii ? 0xff80ff80 : 0xff00ff00; 6206 int short_mask = ascii ? 
0xff80 : 0xff00; 6207 6208 // set result 6209 xorl(result, result); 6210 // check for zero length 6211 testl(len, len); 6212 jcc(Assembler::zero, L_done); 6213 6214 movl(result, len); 6215 6216 // Setup pointers 6217 lea(src, Address(src, len, Address::times_2)); // char[] 6218 lea(dst, Address(dst, len, Address::times_1)); // byte[] 6219 negptr(len); 6220 6221 if (UseSSE42Intrinsics || UseAVX >= 2) { 6222 Label L_copy_8_chars, L_copy_8_chars_exit; 6223 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit; 6224 6225 if (UseAVX >= 2) { 6226 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit; 6227 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector 6228 movdl(tmp1Reg, tmp5); 6229 vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit); 6230 jmp(L_chars_32_check); 6231 6232 bind(L_copy_32_chars); 6233 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64)); 6234 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32)); 6235 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1); 6236 vptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector 6237 jccb(Assembler::notZero, L_copy_32_chars_exit); 6238 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1); 6239 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1); 6240 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg); 6241 6242 bind(L_chars_32_check); 6243 addptr(len, 32); 6244 jcc(Assembler::lessEqual, L_copy_32_chars); 6245 6246 bind(L_copy_32_chars_exit); 6247 subptr(len, 16); 6248 jccb(Assembler::greater, L_copy_16_chars_exit); 6249 6250 } else if (UseSSE42Intrinsics) { 6251 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector 6252 movdl(tmp1Reg, tmp5); 6253 pshufd(tmp1Reg, tmp1Reg, 0); 6254 jmpb(L_chars_16_check); 6255 } 6256 6257 bind(L_copy_16_chars); 6258 if (UseAVX >= 2) { 6259 vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32)); 6260 vptest(tmp2Reg, tmp1Reg); 6261 jcc(Assembler::notZero, L_copy_16_chars_exit); 6262 vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1); 6263 vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1); 6264 } else { 6265 if (UseAVX > 0) { 6266 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32)); 6267 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16)); 6268 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0); 6269 } else { 6270 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32)); 6271 por(tmp2Reg, tmp3Reg); 6272 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16)); 6273 por(tmp2Reg, tmp4Reg); 6274 } 6275 ptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector 6276 jccb(Assembler::notZero, L_copy_16_chars_exit); 6277 packuswb(tmp3Reg, tmp4Reg); 6278 } 6279 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg); 6280 6281 bind(L_chars_16_check); 6282 addptr(len, 16); 6283 jcc(Assembler::lessEqual, L_copy_16_chars); 6284 6285 bind(L_copy_16_chars_exit); 6286 if (UseAVX >= 2) { 6287 // clean upper bits of YMM registers 6288 vpxor(tmp2Reg, tmp2Reg); 6289 vpxor(tmp3Reg, tmp3Reg); 6290 vpxor(tmp4Reg, tmp4Reg); 6291 movdl(tmp1Reg, tmp5); 6292 pshufd(tmp1Reg, tmp1Reg, 0); 6293 } 6294 subptr(len, 8); 6295 jccb(Assembler::greater, L_copy_8_chars_exit); 6296 6297 bind(L_copy_8_chars); 6298 movdqu(tmp3Reg, Address(src, len, Address::times_2, -16)); 6299 ptest(tmp3Reg, tmp1Reg); 6300 jccb(Assembler::notZero, L_copy_8_chars_exit); 6301 packuswb(tmp3Reg, tmp1Reg); 6302 movq(Address(dst, len, Address::times_1, -8), tmp3Reg); 6303 addptr(len, 8); 6304 
jccb(Assembler::lessEqual, L_copy_8_chars); 6305 6306 bind(L_copy_8_chars_exit); 6307 subptr(len, 8); 6308 jccb(Assembler::zero, L_done); 6309 } 6310 6311 bind(L_copy_1_char); 6312 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0)); 6313 testl(tmp5, short_mask); // check if Unicode or non-ASCII char 6314 jccb(Assembler::notZero, L_copy_1_char_exit); 6315 movb(Address(dst, len, Address::times_1, 0), tmp5); 6316 addptr(len, 1); 6317 jccb(Assembler::less, L_copy_1_char); 6318 6319 bind(L_copy_1_char_exit); 6320 addptr(result, len); // len is negative count of not processed elements 6321 6322 bind(L_done); 6323 } 6324 6325 #ifdef _LP64 6326 /** 6327 * Helper for multiply_to_len(). 6328 */ 6329 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) { 6330 addq(dest_lo, src1); 6331 adcq(dest_hi, 0); 6332 addq(dest_lo, src2); 6333 adcq(dest_hi, 0); 6334 } 6335 6336 /** 6337 * Multiply 64 bit by 64 bit first loop. 6338 */ 6339 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart, 6340 Register y, Register y_idx, Register z, 6341 Register carry, Register product, 6342 Register idx, Register kdx) { 6343 // 6344 // jlong carry, x[], y[], z[]; 6345 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 6346 // huge_128 product = y[idx] * x[xstart] + carry; 6347 // z[kdx] = (jlong)product; 6348 // carry = (jlong)(product >>> 64); 6349 // } 6350 // z[xstart] = carry; 6351 // 6352 6353 Label L_first_loop, L_first_loop_exit; 6354 Label L_one_x, L_one_y, L_multiply; 6355 6356 decrementl(xstart); 6357 jcc(Assembler::negative, L_one_x); 6358 6359 movq(x_xstart, Address(x, xstart, Address::times_4, 0)); 6360 rorq(x_xstart, 32); // convert big-endian to little-endian 6361 6362 bind(L_first_loop); 6363 decrementl(idx); 6364 jcc(Assembler::negative, L_first_loop_exit); 6365 decrementl(idx); 6366 jcc(Assembler::negative, L_one_y); 6367 movq(y_idx, Address(y, idx, Address::times_4, 0)); 6368 rorq(y_idx, 32); // convert big-endian to little-endian 6369 bind(L_multiply); 6370 movq(product, x_xstart); 6371 mulq(y_idx); // product(rax) * y_idx -> rdx:rax 6372 addq(product, carry); 6373 adcq(rdx, 0); 6374 subl(kdx, 2); 6375 movl(Address(z, kdx, Address::times_4, 4), product); 6376 shrq(product, 32); 6377 movl(Address(z, kdx, Address::times_4, 0), product); 6378 movq(carry, rdx); 6379 jmp(L_first_loop); 6380 6381 bind(L_one_y); 6382 movl(y_idx, Address(y, 0)); 6383 jmp(L_multiply); 6384 6385 bind(L_one_x); 6386 movl(x_xstart, Address(x, 0)); 6387 jmp(L_first_loop); 6388 6389 bind(L_first_loop_exit); 6390 } 6391 6392 /** 6393 * Multiply 64 bit by 64 bit and add 128 bit. 
6394 */ 6395 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z, 6396 Register yz_idx, Register idx, 6397 Register carry, Register product, int offset) { 6398 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 6399 // z[kdx] = (jlong)product; 6400 6401 movq(yz_idx, Address(y, idx, Address::times_4, offset)); 6402 rorq(yz_idx, 32); // convert big-endian to little-endian 6403 movq(product, x_xstart); 6404 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) 6405 movq(yz_idx, Address(z, idx, Address::times_4, offset)); 6406 rorq(yz_idx, 32); // convert big-endian to little-endian 6407 6408 add2_with_carry(rdx, product, carry, yz_idx); 6409 6410 movl(Address(z, idx, Address::times_4, offset+4), product); 6411 shrq(product, 32); 6412 movl(Address(z, idx, Address::times_4, offset), product); 6413 6414 } 6415 6416 /** 6417 * Multiply 128 bit by 128 bit. Unrolled inner loop. 6418 */ 6419 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z, 6420 Register yz_idx, Register idx, Register jdx, 6421 Register carry, Register product, 6422 Register carry2) { 6423 // jlong carry, x[], y[], z[]; 6424 // int kdx = ystart+1; 6425 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 6426 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 6427 // z[kdx+idx+1] = (jlong)product; 6428 // jlong carry2 = (jlong)(product >>> 64); 6429 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 6430 // z[kdx+idx] = (jlong)product; 6431 // carry = (jlong)(product >>> 64); 6432 // } 6433 // idx += 2; 6434 // if (idx > 0) { 6435 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 6436 // z[kdx+idx] = (jlong)product; 6437 // carry = (jlong)(product >>> 64); 6438 // } 6439 // 6440 6441 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 6442 6443 movl(jdx, idx); 6444 andl(jdx, 0xFFFFFFFC); 6445 shrl(jdx, 2); 6446 6447 bind(L_third_loop); 6448 subl(jdx, 1); 6449 jcc(Assembler::negative, L_third_loop_exit); 6450 subl(idx, 4); 6451 6452 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8); 6453 movq(carry2, rdx); 6454 6455 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0); 6456 movq(carry, rdx); 6457 jmp(L_third_loop); 6458 6459 bind (L_third_loop_exit); 6460 6461 andl (idx, 0x3); 6462 jcc(Assembler::zero, L_post_third_loop_done); 6463 6464 Label L_check_1; 6465 subl(idx, 2); 6466 jcc(Assembler::negative, L_check_1); 6467 6468 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0); 6469 movq(carry, rdx); 6470 6471 bind (L_check_1); 6472 addl (idx, 0x2); 6473 andl (idx, 0x1); 6474 subl(idx, 1); 6475 jcc(Assembler::negative, L_post_third_loop_done); 6476 6477 movl(yz_idx, Address(y, idx, Address::times_4, 0)); 6478 movq(product, x_xstart); 6479 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) 6480 movl(yz_idx, Address(z, idx, Address::times_4, 0)); 6481 6482 add2_with_carry(rdx, product, yz_idx, carry); 6483 6484 movl(Address(z, idx, Address::times_4, 0), product); 6485 shrq(product, 32); 6486 6487 shlq(rdx, 32); 6488 orq(product, rdx); 6489 movq(carry, product); 6490 6491 bind(L_post_third_loop_done); 6492 } 6493 6494 /** 6495 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop. 
6496 * 6497 */ 6498 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z, 6499 Register carry, Register carry2, 6500 Register idx, Register jdx, 6501 Register yz_idx1, Register yz_idx2, 6502 Register tmp, Register tmp3, Register tmp4) { 6503 assert(UseBMI2Instructions, "should be used only when BMI2 is available"); 6504 6505 // jlong carry, x[], y[], z[]; 6506 // int kdx = ystart+1; 6507 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 6508 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry; 6509 // jlong carry2 = (jlong)(tmp3 >>> 64); 6510 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2; 6511 // carry = (jlong)(tmp4 >>> 64); 6512 // z[kdx+idx+1] = (jlong)tmp3; 6513 // z[kdx+idx] = (jlong)tmp4; 6514 // } 6515 // idx += 2; 6516 // if (idx > 0) { 6517 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry; 6518 // z[kdx+idx] = (jlong)yz_idx1; 6519 // carry = (jlong)(yz_idx1 >>> 64); 6520 // } 6521 // 6522 6523 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 6524 6525 movl(jdx, idx); 6526 andl(jdx, 0xFFFFFFFC); 6527 shrl(jdx, 2); 6528 6529 bind(L_third_loop); 6530 subl(jdx, 1); 6531 jcc(Assembler::negative, L_third_loop_exit); 6532 subl(idx, 4); 6533 6534 movq(yz_idx1, Address(y, idx, Address::times_4, 8)); 6535 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian 6536 movq(yz_idx2, Address(y, idx, Address::times_4, 0)); 6537 rorxq(yz_idx2, yz_idx2, 32); 6538 6539 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 6540 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp 6541 6542 movq(yz_idx1, Address(z, idx, Address::times_4, 8)); 6543 rorxq(yz_idx1, yz_idx1, 32); 6544 movq(yz_idx2, Address(z, idx, Address::times_4, 0)); 6545 rorxq(yz_idx2, yz_idx2, 32); 6546 6547 if (VM_Version::supports_adx()) { 6548 adcxq(tmp3, carry); 6549 adoxq(tmp3, yz_idx1); 6550 6551 adcxq(tmp4, tmp); 6552 adoxq(tmp4, yz_idx2); 6553 6554 movl(carry, 0); // does not affect flags 6555 adcxq(carry2, carry); 6556 adoxq(carry2, carry); 6557 } else { 6558 add2_with_carry(tmp4, tmp3, carry, yz_idx1); 6559 add2_with_carry(carry2, tmp4, tmp, yz_idx2); 6560 } 6561 movq(carry, carry2); 6562 6563 movl(Address(z, idx, Address::times_4, 12), tmp3); 6564 shrq(tmp3, 32); 6565 movl(Address(z, idx, Address::times_4, 8), tmp3); 6566 6567 movl(Address(z, idx, Address::times_4, 4), tmp4); 6568 shrq(tmp4, 32); 6569 movl(Address(z, idx, Address::times_4, 0), tmp4); 6570 6571 jmp(L_third_loop); 6572 6573 bind (L_third_loop_exit); 6574 6575 andl (idx, 0x3); 6576 jcc(Assembler::zero, L_post_third_loop_done); 6577 6578 Label L_check_1; 6579 subl(idx, 2); 6580 jcc(Assembler::negative, L_check_1); 6581 6582 movq(yz_idx1, Address(y, idx, Address::times_4, 0)); 6583 rorxq(yz_idx1, yz_idx1, 32); 6584 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 6585 movq(yz_idx2, Address(z, idx, Address::times_4, 0)); 6586 rorxq(yz_idx2, yz_idx2, 32); 6587 6588 add2_with_carry(tmp4, tmp3, carry, yz_idx2); 6589 6590 movl(Address(z, idx, Address::times_4, 4), tmp3); 6591 shrq(tmp3, 32); 6592 movl(Address(z, idx, Address::times_4, 0), tmp3); 6593 movq(carry, tmp4); 6594 6595 bind (L_check_1); 6596 addl (idx, 0x2); 6597 andl (idx, 0x1); 6598 subl(idx, 1); 6599 jcc(Assembler::negative, L_post_third_loop_done); 6600 movl(tmp4, Address(y, idx, Address::times_4, 0)); 6601 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3 6602 movl(tmp4, Address(z, idx, Address::times_4, 0)); 6603 6604 add2_with_carry(carry2, tmp3, tmp4, carry); 6605 6606 movl(Address(z, idx, 
Address::times_4, 0), tmp3); 6607 shrq(tmp3, 32); 6608 6609 shlq(carry2, 32); 6610 orq(tmp3, carry2); 6611 movq(carry, tmp3); 6612 6613 bind(L_post_third_loop_done); 6614 } 6615 6616 /** 6617 * Code for BigInteger::multiplyToLen() intrinsic. 6618 * 6619 * rdi: x 6620 * rax: xlen 6621 * rsi: y 6622 * rcx: ylen 6623 * r8: z 6624 * r11: zlen 6625 * r12: tmp1 6626 * r13: tmp2 6627 * r14: tmp3 6628 * r15: tmp4 6629 * rbx: tmp5 6630 * 6631 */ 6632 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen, 6633 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 6634 ShortBranchVerifier sbv(this); 6635 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx); 6636 6637 push(tmp1); 6638 push(tmp2); 6639 push(tmp3); 6640 push(tmp4); 6641 push(tmp5); 6642 6643 push(xlen); 6644 push(zlen); 6645 6646 const Register idx = tmp1; 6647 const Register kdx = tmp2; 6648 const Register xstart = tmp3; 6649 6650 const Register y_idx = tmp4; 6651 const Register carry = tmp5; 6652 const Register product = xlen; 6653 const Register x_xstart = zlen; // reuse register 6654 6655 // First Loop. 6656 // 6657 // final static long LONG_MASK = 0xffffffffL; 6658 // int xstart = xlen - 1; 6659 // int ystart = ylen - 1; 6660 // long carry = 0; 6661 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 6662 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 6663 // z[kdx] = (int)product; 6664 // carry = product >>> 32; 6665 // } 6666 // z[xstart] = (int)carry; 6667 // 6668 6669 movl(idx, ylen); // idx = ylen; 6670 movl(kdx, zlen); // kdx = xlen+ylen; 6671 xorq(carry, carry); // carry = 0; 6672 6673 Label L_done; 6674 6675 movl(xstart, xlen); 6676 decrementl(xstart); 6677 jcc(Assembler::negative, L_done); 6678 6679 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx); 6680 6681 Label L_second_loop; 6682 testl(kdx, kdx); 6683 jcc(Assembler::zero, L_second_loop); 6684 6685 Label L_carry; 6686 subl(kdx, 1); 6687 jcc(Assembler::zero, L_carry); 6688 6689 movl(Address(z, kdx, Address::times_4, 0), carry); 6690 shrq(carry, 32); 6691 subl(kdx, 1); 6692 6693 bind(L_carry); 6694 movl(Address(z, kdx, Address::times_4, 0), carry); 6695 6696 // Second and third (nested) loops. 
6697 // 6698 // for (int i = xstart-1; i >= 0; i--) { // Second loop 6699 // carry = 0; 6700 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 6701 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 6702 // (z[k] & LONG_MASK) + carry; 6703 // z[k] = (int)product; 6704 // carry = product >>> 32; 6705 // } 6706 // z[i] = (int)carry; 6707 // } 6708 // 6709 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 6710 6711 const Register jdx = tmp1; 6712 6713 bind(L_second_loop); 6714 xorl(carry, carry); // carry = 0; 6715 movl(jdx, ylen); // j = ystart+1 6716 6717 subl(xstart, 1); // i = xstart-1; 6718 jcc(Assembler::negative, L_done); 6719 6720 push (z); 6721 6722 Label L_last_x; 6723 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j 6724 subl(xstart, 1); // i = xstart-1; 6725 jcc(Assembler::negative, L_last_x); 6726 6727 if (UseBMI2Instructions) { 6728 movq(rdx, Address(x, xstart, Address::times_4, 0)); 6729 rorxq(rdx, rdx, 32); // convert big-endian to little-endian 6730 } else { 6731 movq(x_xstart, Address(x, xstart, Address::times_4, 0)); 6732 rorq(x_xstart, 32); // convert big-endian to little-endian 6733 } 6734 6735 Label L_third_loop_prologue; 6736 bind(L_third_loop_prologue); 6737 6738 push (x); 6739 push (xstart); 6740 push (ylen); 6741 6742 6743 if (UseBMI2Instructions) { 6744 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4); 6745 } else { // !UseBMI2Instructions 6746 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x); 6747 } 6748 6749 pop(ylen); 6750 pop(xlen); 6751 pop(x); 6752 pop(z); 6753 6754 movl(tmp3, xlen); 6755 addl(tmp3, 1); 6756 movl(Address(z, tmp3, Address::times_4, 0), carry); 6757 subl(tmp3, 1); 6758 jccb(Assembler::negative, L_done); 6759 6760 shrq(carry, 32); 6761 movl(Address(z, tmp3, Address::times_4, 0), carry); 6762 jmp(L_second_loop); 6763 6764 // Next infrequent code is moved outside loops. 6765 bind(L_last_x); 6766 if (UseBMI2Instructions) { 6767 movl(rdx, Address(x, 0)); 6768 } else { 6769 movl(x_xstart, Address(x, 0)); 6770 } 6771 jmp(L_third_loop_prologue); 6772 6773 bind(L_done); 6774 6775 pop(zlen); 6776 pop(xlen); 6777 6778 pop(tmp5); 6779 pop(tmp4); 6780 pop(tmp3); 6781 pop(tmp2); 6782 pop(tmp1); 6783 } 6784 6785 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale, 6786 Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){ 6787 assert(UseSSE42Intrinsics, "SSE4.2 must be enabled."); 6788 Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP; 6789 Label VECTOR8_TAIL, VECTOR4_TAIL; 6790 Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL; 6791 Label SAME_TILL_END, DONE; 6792 Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL; 6793 6794 //scale is in rcx in both Win64 and Unix 6795 ShortBranchVerifier sbv(this); 6796 6797 shlq(length); 6798 xorq(result, result); 6799 6800 if ((AVX3Threshold == 0) && (UseAVX > 2) && 6801 VM_Version::supports_avx512vlbw()) { 6802 Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL; 6803 6804 cmpq(length, 64); 6805 jcc(Assembler::less, VECTOR32_TAIL); 6806 6807 movq(tmp1, length); 6808 andq(tmp1, 0x3F); // tail count 6809 andq(length, ~(0x3F)); //vector count 6810 6811 bind(VECTOR64_LOOP); 6812 // AVX512 code to compare 64 byte vectors. 
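    // evpcmpeqb sets one mask bit per byte that compares equal; kortestql
    // sets the carry flag only when all 64 bits are set, so aboveEqual
    // (carry clear) means at least one of the 64 bytes differed.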
6813 evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit); 6814 evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit); 6815 kortestql(k7, k7); 6816 jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch 6817 addq(result, 64); 6818 subq(length, 64); 6819 jccb(Assembler::notZero, VECTOR64_LOOP); 6820 6821 //bind(VECTOR64_TAIL); 6822 testq(tmp1, tmp1); 6823 jcc(Assembler::zero, SAME_TILL_END); 6824 6825 //bind(VECTOR64_TAIL); 6826 // AVX512 code to compare up to 63 byte vectors. 6827 mov64(tmp2, 0xFFFFFFFFFFFFFFFF); 6828 shlxq(tmp2, tmp2, tmp1); 6829 notq(tmp2); 6830 kmovql(k3, tmp2); 6831 6832 evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit); 6833 evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit); 6834 6835 ktestql(k7, k3); 6836 jcc(Assembler::below, SAME_TILL_END); // not mismatch 6837 6838 bind(VECTOR64_NOT_EQUAL); 6839 kmovql(tmp1, k7); 6840 notq(tmp1); 6841 tzcntq(tmp1, tmp1); 6842 addq(result, tmp1); 6843 shrq(result); 6844 jmp(DONE); 6845 bind(VECTOR32_TAIL); 6846 } 6847 6848 cmpq(length, 8); 6849 jcc(Assembler::equal, VECTOR8_LOOP); 6850 jcc(Assembler::less, VECTOR4_TAIL); 6851 6852 if (UseAVX >= 2) { 6853 Label VECTOR16_TAIL, VECTOR32_LOOP; 6854 6855 cmpq(length, 16); 6856 jcc(Assembler::equal, VECTOR16_LOOP); 6857 jcc(Assembler::less, VECTOR8_LOOP); 6858 6859 cmpq(length, 32); 6860 jccb(Assembler::less, VECTOR16_TAIL); 6861 6862 subq(length, 32); 6863 bind(VECTOR32_LOOP); 6864 vmovdqu(rymm0, Address(obja, result)); 6865 vmovdqu(rymm1, Address(objb, result)); 6866 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit); 6867 vptest(rymm2, rymm2); 6868 jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found 6869 addq(result, 32); 6870 subq(length, 32); 6871 jcc(Assembler::greaterEqual, VECTOR32_LOOP); 6872 addq(length, 32); 6873 jcc(Assembler::equal, SAME_TILL_END); 6874 //falling through if less than 32 bytes left //close the branch here. 
6875 6876 bind(VECTOR16_TAIL); 6877 cmpq(length, 16); 6878 jccb(Assembler::less, VECTOR8_TAIL); 6879 bind(VECTOR16_LOOP); 6880 movdqu(rymm0, Address(obja, result)); 6881 movdqu(rymm1, Address(objb, result)); 6882 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit); 6883 ptest(rymm2, rymm2); 6884 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found 6885 addq(result, 16); 6886 subq(length, 16); 6887 jcc(Assembler::equal, SAME_TILL_END); 6888 //falling through if less than 16 bytes left 6889 } else {//regular intrinsics 6890 6891 cmpq(length, 16); 6892 jccb(Assembler::less, VECTOR8_TAIL); 6893 6894 subq(length, 16); 6895 bind(VECTOR16_LOOP); 6896 movdqu(rymm0, Address(obja, result)); 6897 movdqu(rymm1, Address(objb, result)); 6898 pxor(rymm0, rymm1); 6899 ptest(rymm0, rymm0); 6900 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found 6901 addq(result, 16); 6902 subq(length, 16); 6903 jccb(Assembler::greaterEqual, VECTOR16_LOOP); 6904 addq(length, 16); 6905 jcc(Assembler::equal, SAME_TILL_END); 6906 //falling through if less than 16 bytes left 6907 } 6908 6909 bind(VECTOR8_TAIL); 6910 cmpq(length, 8); 6911 jccb(Assembler::less, VECTOR4_TAIL); 6912 bind(VECTOR8_LOOP); 6913 movq(tmp1, Address(obja, result)); 6914 movq(tmp2, Address(objb, result)); 6915 xorq(tmp1, tmp2); 6916 testq(tmp1, tmp1); 6917 jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found 6918 addq(result, 8); 6919 subq(length, 8); 6920 jcc(Assembler::equal, SAME_TILL_END); 6921 //falling through if less than 8 bytes left 6922 6923 bind(VECTOR4_TAIL); 6924 cmpq(length, 4); 6925 jccb(Assembler::less, BYTES_TAIL); 6926 bind(VECTOR4_LOOP); 6927 movl(tmp1, Address(obja, result)); 6928 xorl(tmp1, Address(objb, result)); 6929 testl(tmp1, tmp1); 6930 jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found 6931 addq(result, 4); 6932 subq(length, 4); 6933 jcc(Assembler::equal, SAME_TILL_END); 6934 //falling through if less than 4 bytes left 6935 6936 bind(BYTES_TAIL); 6937 bind(BYTES_LOOP); 6938 load_unsigned_byte(tmp1, Address(obja, result)); 6939 load_unsigned_byte(tmp2, Address(objb, result)); 6940 xorl(tmp1, tmp2); 6941 testl(tmp1, tmp1); 6942 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 6943 decq(length); 6944 jcc(Assembler::zero, SAME_TILL_END); 6945 incq(result); 6946 load_unsigned_byte(tmp1, Address(obja, result)); 6947 load_unsigned_byte(tmp2, Address(objb, result)); 6948 xorl(tmp1, tmp2); 6949 testl(tmp1, tmp1); 6950 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 6951 decq(length); 6952 jcc(Assembler::zero, SAME_TILL_END); 6953 incq(result); 6954 load_unsigned_byte(tmp1, Address(obja, result)); 6955 load_unsigned_byte(tmp2, Address(objb, result)); 6956 xorl(tmp1, tmp2); 6957 testl(tmp1, tmp1); 6958 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 6959 jmp(SAME_TILL_END); 6960 6961 if (UseAVX >= 2) { 6962 bind(VECTOR32_NOT_EQUAL); 6963 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit); 6964 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit); 6965 vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit); 6966 vpmovmskb(tmp1, rymm0); 6967 bsfq(tmp1, tmp1); 6968 addq(result, tmp1); 6969 shrq(result); 6970 jmp(DONE); 6971 } 6972 6973 bind(VECTOR16_NOT_EQUAL); 6974 if (UseAVX >= 2) { 6975 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit); 6976 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit); 6977 pxor(rymm0, rymm2); 6978 } else { 6979 pcmpeqb(rymm2, rymm2); 6980 pxor(rymm0, rymm1); 6981 pcmpeqb(rymm0, rymm1); 6982 pxor(rymm0, rymm2); 6983 } 6984 pmovmskb(tmp1, rymm0); 6985 
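  // tmp1 now holds one bit per mismatching byte of the 16-byte chunk; bsf
  // finds the first such byte, and the final shrq (shift count in rcx,
  // log2 of the element size) scales the byte offset back to an element
  // index.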
bsfq(tmp1, tmp1); 6986 addq(result, tmp1); 6987 shrq(result); 6988 jmpb(DONE); 6989 6990 bind(VECTOR8_NOT_EQUAL); 6991 bind(VECTOR4_NOT_EQUAL); 6992 bsfq(tmp1, tmp1); 6993 shrq(tmp1, 3); 6994 addq(result, tmp1); 6995 bind(BYTES_NOT_EQUAL); 6996 shrq(result); 6997 jmpb(DONE); 6998 6999 bind(SAME_TILL_END); 7000 mov64(result, -1); 7001 7002 bind(DONE); 7003 } 7004 7005 //Helper functions for square_to_len() 7006 7007 /** 7008 * Store the squares of x[], right shifted one bit (divided by 2) into z[] 7009 * Preserves x and z and modifies rest of the registers. 7010 */ 7011 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 7012 // Perform square and right shift by 1 7013 // Handle odd xlen case first, then for even xlen do the following 7014 // jlong carry = 0; 7015 // for (int j=0, i=0; j < xlen; j+=2, i+=4) { 7016 // huge_128 product = x[j:j+1] * x[j:j+1]; 7017 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65); 7018 // z[i+2:i+3] = (jlong)(product >>> 1); 7019 // carry = (jlong)product; 7020 // } 7021 7022 xorq(tmp5, tmp5); // carry 7023 xorq(rdxReg, rdxReg); 7024 xorl(tmp1, tmp1); // index for x 7025 xorl(tmp4, tmp4); // index for z 7026 7027 Label L_first_loop, L_first_loop_exit; 7028 7029 testl(xlen, 1); 7030 jccb(Assembler::zero, L_first_loop); //jump if xlen is even 7031 7032 // Square and right shift by 1 the odd element using 32 bit multiply 7033 movl(raxReg, Address(x, tmp1, Address::times_4, 0)); 7034 imulq(raxReg, raxReg); 7035 shrq(raxReg, 1); 7036 adcq(tmp5, 0); 7037 movq(Address(z, tmp4, Address::times_4, 0), raxReg); 7038 incrementl(tmp1); 7039 addl(tmp4, 2); 7040 7041 // Square and right shift by 1 the rest using 64 bit multiply 7042 bind(L_first_loop); 7043 cmpptr(tmp1, xlen); 7044 jccb(Assembler::equal, L_first_loop_exit); 7045 7046 // Square 7047 movq(raxReg, Address(x, tmp1, Address::times_4, 0)); 7048 rorq(raxReg, 32); // convert big-endian to little-endian 7049 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax 7050 7051 // Right shift by 1 and save carry 7052 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1 7053 rcrq(rdxReg, 1); 7054 rcrq(raxReg, 1); 7055 adcq(tmp5, 0); 7056 7057 // Store result in z 7058 movq(Address(z, tmp4, Address::times_4, 0), rdxReg); 7059 movq(Address(z, tmp4, Address::times_4, 8), raxReg); 7060 7061 // Update indices for x and z 7062 addl(tmp1, 2); 7063 addl(tmp4, 4); 7064 jmp(L_first_loop); 7065 7066 bind(L_first_loop_exit); 7067 } 7068 7069 7070 /** 7071 * Perform the following multiply add operation using BMI2 instructions 7072 * carry:sum = sum + op1*op2 + carry 7073 * op2 should be in rdx 7074 * op2 is preserved, all other registers are modified 7075 */ 7076 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) { 7077 // assert op2 is rdx 7078 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1 7079 addq(sum, carry); 7080 adcq(tmp2, 0); 7081 addq(sum, op1); 7082 adcq(tmp2, 0); 7083 movq(carry, tmp2); 7084 } 7085 7086 /** 7087 * Perform the following multiply add operation: 7088 * carry:sum = sum + op1*op2 + carry 7089 * Preserves op1, op2 and modifies rest of registers 7090 */ 7091 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) { 7092 // rdx:rax = op1 * op2 7093 movq(raxReg, op2); 7094 mulq(op1); 7095 7096 // rdx:rax = sum + carry + rdx:rax 7097 addq(sum, carry); 7098 
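// The adcq below (and the one after the second addq) fold each carry-out into
// rdxReg, so the pair rdxReg:sum accumulates the full 128-bit value
//   sum + carry + op1 * op2
// without losing the high bits; rdxReg then becomes the new carry.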
adcq(rdxReg, 0); 7099 addq(sum, raxReg); 7100 adcq(rdxReg, 0); 7101 7102 // carry:sum = rdx:sum 7103 movq(carry, rdxReg); 7104 } 7105 7106 /** 7107 * Add 64 bit long carry into z[] with carry propagation. 7108 * Preserves z and carry register values and modifies rest of registers. 7109 * 7110 */ 7111 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) { 7112 Label L_fourth_loop, L_fourth_loop_exit; 7113 7114 movl(tmp1, 1); 7115 subl(zlen, 2); 7116 addq(Address(z, zlen, Address::times_4, 0), carry); 7117 7118 bind(L_fourth_loop); 7119 jccb(Assembler::carryClear, L_fourth_loop_exit); 7120 subl(zlen, 2); 7121 jccb(Assembler::negative, L_fourth_loop_exit); 7122 addq(Address(z, zlen, Address::times_4, 0), tmp1); 7123 jmp(L_fourth_loop); 7124 bind(L_fourth_loop_exit); 7125 } 7126 7127 /** 7128 * Shift z[] left by 1 bit. 7129 * Preserves x, len, z and zlen registers and modifies rest of the registers. 7130 * 7131 */ 7132 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) { 7133 7134 Label L_fifth_loop, L_fifth_loop_exit; 7135 7136 // Fifth loop 7137 // Perform primitiveLeftShift(z, zlen, 1) 7138 7139 const Register prev_carry = tmp1; 7140 const Register new_carry = tmp4; 7141 const Register value = tmp2; 7142 const Register zidx = tmp3; 7143 7144 // int zidx, carry; 7145 // long value; 7146 // carry = 0; 7147 // for (zidx = zlen-2; zidx >=0; zidx -= 2) { 7148 // (carry:value) = (z[i] << 1) | carry ; 7149 // z[i] = value; 7150 // } 7151 7152 movl(zidx, zlen); 7153 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register 7154 7155 bind(L_fifth_loop); 7156 decl(zidx); // Use decl to preserve carry flag 7157 decl(zidx); 7158 jccb(Assembler::negative, L_fifth_loop_exit); 7159 7160 if (UseBMI2Instructions) { 7161 movq(value, Address(z, zidx, Address::times_4, 0)); 7162 rclq(value, 1); 7163 rorxq(value, value, 32); 7164 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form 7165 } 7166 else { 7167 // clear new_carry 7168 xorl(new_carry, new_carry); 7169 7170 // Shift z[i] by 1, or in previous carry and save new carry 7171 movq(value, Address(z, zidx, Address::times_4, 0)); 7172 shlq(value, 1); 7173 adcl(new_carry, 0); 7174 7175 orq(value, prev_carry); 7176 rorq(value, 0x20); 7177 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form 7178 7179 // Set previous carry = new carry 7180 movl(prev_carry, new_carry); 7181 } 7182 jmp(L_fifth_loop); 7183 7184 bind(L_fifth_loop_exit); 7185 } 7186 7187 7188 /** 7189 * Code for BigInteger::squareToLen() intrinsic 7190 * 7191 * rdi: x 7192 * rsi: len 7193 * r8: z 7194 * rcx: zlen 7195 * r12: tmp1 7196 * r13: tmp2 7197 * r14: tmp3 7198 * r15: tmp4 7199 * rbx: tmp5 7200 * 7201 */ 7202 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 7203 7204 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply; 7205 push(tmp1); 7206 push(tmp2); 7207 push(tmp3); 7208 push(tmp4); 7209 push(tmp5); 7210 7211 // First loop 7212 // Store the squares, right shifted one bit (i.e., divided by 2). 7213 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg); 7214 7215 // Add in off-diagonal sums. 7216 // 7217 // Second, third (nested) and fourth loops. 
7218 // zlen +=2; 7219 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) { 7220 // carry = 0; 7221 // long op2 = x[xidx:xidx+1]; 7222 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) { 7223 // k -= 2; 7224 // long op1 = x[j:j+1]; 7225 // long sum = z[k:k+1]; 7226 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs); 7227 // z[k:k+1] = sum; 7228 // } 7229 // add_one_64(z, k, carry, tmp_regs); 7230 // } 7231 7232 const Register carry = tmp5; 7233 const Register sum = tmp3; 7234 const Register op1 = tmp4; 7235 Register op2 = tmp2; 7236 7237 push(zlen); 7238 push(len); 7239 addl(zlen,2); 7240 bind(L_second_loop); 7241 xorq(carry, carry); 7242 subl(zlen, 4); 7243 subl(len, 2); 7244 push(zlen); 7245 push(len); 7246 cmpl(len, 0); 7247 jccb(Assembler::lessEqual, L_second_loop_exit); 7248 7249 // Multiply an array by one 64 bit long. 7250 if (UseBMI2Instructions) { 7251 op2 = rdxReg; 7252 movq(op2, Address(x, len, Address::times_4, 0)); 7253 rorxq(op2, op2, 32); 7254 } 7255 else { 7256 movq(op2, Address(x, len, Address::times_4, 0)); 7257 rorq(op2, 32); 7258 } 7259 7260 bind(L_third_loop); 7261 decrementl(len); 7262 jccb(Assembler::negative, L_third_loop_exit); 7263 decrementl(len); 7264 jccb(Assembler::negative, L_last_x); 7265 7266 movq(op1, Address(x, len, Address::times_4, 0)); 7267 rorq(op1, 32); 7268 7269 bind(L_multiply); 7270 subl(zlen, 2); 7271 movq(sum, Address(z, zlen, Address::times_4, 0)); 7272 7273 // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry. 7274 if (UseBMI2Instructions) { 7275 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2); 7276 } 7277 else { 7278 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 7279 } 7280 7281 movq(Address(z, zlen, Address::times_4, 0), sum); 7282 7283 jmp(L_third_loop); 7284 bind(L_third_loop_exit); 7285 7286 // Fourth loop 7287 // Add 64 bit long carry into z with carry propagation. 7288 // Uses offsetted zlen. 7289 add_one_64(z, zlen, carry, tmp1); 7290 7291 pop(len); 7292 pop(zlen); 7293 jmp(L_second_loop); 7294 7295 // Next infrequent code is moved outside loops. 7296 bind(L_last_x); 7297 movl(op1, Address(x, 0)); 7298 jmp(L_multiply); 7299 7300 bind(L_second_loop_exit); 7301 pop(len); 7302 pop(zlen); 7303 pop(len); 7304 pop(zlen); 7305 7306 // Fifth loop 7307 // Shift z left 1 bit. 7308 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4); 7309 7310 // z[zlen-1] |= x[len-1] & 1; 7311 movl(tmp3, Address(x, len, Address::times_4, -4)); 7312 andl(tmp3, 1); 7313 orl(Address(z, zlen, Address::times_4, -4), tmp3); 7314 7315 pop(tmp5); 7316 pop(tmp4); 7317 pop(tmp3); 7318 pop(tmp2); 7319 pop(tmp1); 7320 } 7321 7322 /** 7323 * Helper function for mul_add() 7324 * Multiply the in[] by int k and add to out[] starting at offset offs using 7325 * 128 bit by 32 bit multiply and return the carry in tmp5. 7326 * Only quad int aligned length of in[] is operated on in this function. 7327 * k is in rdxReg for BMI2Instructions, for others it is in tmp2. 7328 * This function preserves out, in and k registers. 7329 * len and offset point to the appropriate index in "in" & "out" correspondingly 7330 * tmp5 has the carry. 7331 * other registers are temporary and are modified. 
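 * For example (illustrative numbers): with len = 8 the loop below runs twice,
 * and each iteration multiply-accumulates two 64-bit halves of in[] into the
 * corresponding words of out[], i.e. four 32-bit words per iteration; any
 * remaining words (at most three) are left for the caller, mul_add, to finish.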
7332 * 7333 */ 7334 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, 7335 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3, 7336 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 7337 7338 Label L_first_loop, L_first_loop_exit; 7339 7340 movl(tmp1, len); 7341 shrl(tmp1, 2); 7342 7343 bind(L_first_loop); 7344 subl(tmp1, 1); 7345 jccb(Assembler::negative, L_first_loop_exit); 7346 7347 subl(len, 4); 7348 subl(offset, 4); 7349 7350 Register op2 = tmp2; 7351 const Register sum = tmp3; 7352 const Register op1 = tmp4; 7353 const Register carry = tmp5; 7354 7355 if (UseBMI2Instructions) { 7356 op2 = rdxReg; 7357 } 7358 7359 movq(op1, Address(in, len, Address::times_4, 8)); 7360 rorq(op1, 32); 7361 movq(sum, Address(out, offset, Address::times_4, 8)); 7362 rorq(sum, 32); 7363 if (UseBMI2Instructions) { 7364 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 7365 } 7366 else { 7367 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 7368 } 7369 // Store back in big endian from little endian 7370 rorq(sum, 0x20); 7371 movq(Address(out, offset, Address::times_4, 8), sum); 7372 7373 movq(op1, Address(in, len, Address::times_4, 0)); 7374 rorq(op1, 32); 7375 movq(sum, Address(out, offset, Address::times_4, 0)); 7376 rorq(sum, 32); 7377 if (UseBMI2Instructions) { 7378 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 7379 } 7380 else { 7381 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 7382 } 7383 // Store back in big endian from little endian 7384 rorq(sum, 0x20); 7385 movq(Address(out, offset, Address::times_4, 0), sum); 7386 7387 jmp(L_first_loop); 7388 bind(L_first_loop_exit); 7389 } 7390 7391 /** 7392 * Code for BigInteger::mulAdd() intrinsic 7393 * 7394 * rdi: out 7395 * rsi: in 7396 * r11: offs (out.length - offset) 7397 * rcx: len 7398 * r8: k 7399 * r12: tmp1 7400 * r13: tmp2 7401 * r14: tmp3 7402 * r15: tmp4 7403 * rbx: tmp5 7404 * Multiply the in[] by word k and add to out[], return the carry in rax 7405 */ 7406 void MacroAssembler::mul_add(Register out, Register in, Register offs, 7407 Register len, Register k, Register tmp1, Register tmp2, Register tmp3, 7408 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 7409 7410 Label L_carry, L_last_in, L_done; 7411 7412 // carry = 0; 7413 // for (int j=len-1; j >= 0; j--) { 7414 // long product = (in[j] & LONG_MASK) * kLong + 7415 // (out[offs] & LONG_MASK) + carry; 7416 // out[offs--] = (int)product; 7417 // carry = product >>> 32; 7418 // } 7419 // 7420 push(tmp1); 7421 push(tmp2); 7422 push(tmp3); 7423 push(tmp4); 7424 push(tmp5); 7425 7426 Register op2 = tmp2; 7427 const Register sum = tmp3; 7428 const Register op1 = tmp4; 7429 const Register carry = tmp5; 7430 7431 if (UseBMI2Instructions) { 7432 op2 = rdxReg; 7433 movl(op2, k); 7434 } 7435 else { 7436 movl(op2, k); 7437 } 7438 7439 xorq(carry, carry); 7440 7441 //First loop 7442 7443 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply 7444 //The carry is in tmp5 7445 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg); 7446 7447 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any 7448 decrementl(len); 7449 jccb(Assembler::negative, L_carry); 7450 decrementl(len); 7451 jccb(Assembler::negative, L_last_in); 7452 7453 movq(op1, Address(in, len, Address::times_4, 0)); 7454 rorq(op1, 32); 7455 7456 subl(offs, 2); 7457 movq(sum, Address(out, offs, Address::times_4, 0)); 7458 rorq(sum, 32); 7459 7460 if (UseBMI2Instructions) { 7461 
multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 7462 } 7463 else { 7464 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 7465 } 7466 7467 // Store back in big endian from little endian 7468 rorq(sum, 0x20); 7469 movq(Address(out, offs, Address::times_4, 0), sum); 7470 7471 testl(len, len); 7472 jccb(Assembler::zero, L_carry); 7473 7474 //Multiply the last in[] entry, if any 7475 bind(L_last_in); 7476 movl(op1, Address(in, 0)); 7477 movl(sum, Address(out, offs, Address::times_4, -4)); 7478 7479 movl(raxReg, k); 7480 mull(op1); //tmp4 * eax -> edx:eax 7481 addl(sum, carry); 7482 adcl(rdxReg, 0); 7483 addl(sum, raxReg); 7484 adcl(rdxReg, 0); 7485 movl(carry, rdxReg); 7486 7487 movl(Address(out, offs, Address::times_4, -4), sum); 7488 7489 bind(L_carry); 7490 //return tmp5/carry as carry in rax 7491 movl(rax, carry); 7492 7493 bind(L_done); 7494 pop(tmp5); 7495 pop(tmp4); 7496 pop(tmp3); 7497 pop(tmp2); 7498 pop(tmp1); 7499 } 7500 #endif 7501 7502 /** 7503 * Emits code to update CRC-32 with a byte value according to constants in table 7504 * 7505 * @param [in,out]crc Register containing the crc. 7506 * @param [in]val Register containing the byte to fold into the CRC. 7507 * @param [in]table Register containing the table of crc constants. 7508 * 7509 * uint32_t crc; 7510 * val = crc_table[(val ^ crc) & 0xFF]; 7511 * crc = val ^ (crc >> 8); 7512 * 7513 */ 7514 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 7515 xorl(val, crc); 7516 andl(val, 0xFF); 7517 shrl(crc, 8); // unsigned shift 7518 xorl(crc, Address(table, val, Address::times_4, 0)); 7519 } 7520 7521 /** 7522 * Fold 128-bit data chunk 7523 */ 7524 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) { 7525 if (UseAVX > 0) { 7526 vpclmulhdq(xtmp, xK, xcrc); // [123:64] 7527 vpclmulldq(xcrc, xK, xcrc); // [63:0] 7528 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */); 7529 pxor(xcrc, xtmp); 7530 } else { 7531 movdqa(xtmp, xcrc); 7532 pclmulhdq(xtmp, xK); // [123:64] 7533 pclmulldq(xcrc, xK); // [63:0] 7534 pxor(xcrc, xtmp); 7535 movdqu(xtmp, Address(buf, offset)); 7536 pxor(xcrc, xtmp); 7537 } 7538 } 7539 7540 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) { 7541 if (UseAVX > 0) { 7542 vpclmulhdq(xtmp, xK, xcrc); 7543 vpclmulldq(xcrc, xK, xcrc); 7544 pxor(xcrc, xbuf); 7545 pxor(xcrc, xtmp); 7546 } else { 7547 movdqa(xtmp, xcrc); 7548 pclmulhdq(xtmp, xK); 7549 pclmulldq(xcrc, xK); 7550 pxor(xcrc, xbuf); 7551 pxor(xcrc, xtmp); 7552 } 7553 } 7554 7555 /** 7556 * 8-bit folds to compute 32-bit CRC 7557 * 7558 * uint64_t xcrc; 7559 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8); 7560 */ 7561 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) { 7562 movdl(tmp, xcrc); 7563 andl(tmp, 0xFF); 7564 movdl(xtmp, Address(table, tmp, Address::times_4, 0)); 7565 psrldq(xcrc, 1); // unsigned shift one byte 7566 pxor(xcrc, xtmp); 7567 } 7568 7569 /** 7570 * uint32_t crc; 7571 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 7572 */ 7573 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { 7574 movl(tmp, crc); 7575 andl(tmp, 0xFF); 7576 shrl(crc, 8); 7577 xorl(crc, Address(table, tmp, Address::times_4, 0)); 7578 } 7579 7580 /** 7581 * @param crc register containing existing CRC (32-bit) 7582 * @param buf register pointing to input byte buffer (byte*) 7583 * @param len register containing number of bytes 7584 * 
@param table register that will contain address of CRC table 7585 * @param tmp scratch register 7586 */ 7587 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) { 7588 assert_different_registers(crc, buf, len, table, tmp, rax); 7589 7590 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; 7591 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; 7592 7593 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 7594 // context for the registers used, where all instructions below are using 128-bit mode 7595 // On EVEX without VL and BW, these instructions will all be AVX. 7596 lea(table, ExternalAddress(StubRoutines::crc_table_addr())); 7597 notl(crc); // ~crc 7598 cmpl(len, 16); 7599 jcc(Assembler::less, L_tail); 7600 7601 // Align buffer to 16 bytes 7602 movl(tmp, buf); 7603 andl(tmp, 0xF); 7604 jccb(Assembler::zero, L_aligned); 7605 subl(tmp, 16); 7606 addl(len, tmp); 7607 7608 align(4); 7609 BIND(L_align_loop); 7610 movsbl(rax, Address(buf, 0)); // load byte with sign extension 7611 update_byte_crc32(crc, rax, table); 7612 increment(buf); 7613 incrementl(tmp); 7614 jccb(Assembler::less, L_align_loop); 7615 7616 BIND(L_aligned); 7617 movl(tmp, len); // save 7618 shrl(len, 4); 7619 jcc(Assembler::zero, L_tail_restore); 7620 7621 // Fold crc into first bytes of vector 7622 movdqa(xmm1, Address(buf, 0)); 7623 movdl(rax, xmm1); 7624 xorl(crc, rax); 7625 if (VM_Version::supports_sse4_1()) { 7626 pinsrd(xmm1, crc, 0); 7627 } else { 7628 pinsrw(xmm1, crc, 0); 7629 shrl(crc, 16); 7630 pinsrw(xmm1, crc, 1); 7631 } 7632 addptr(buf, 16); 7633 subl(len, 4); // len > 0 7634 jcc(Assembler::less, L_fold_tail); 7635 7636 movdqa(xmm2, Address(buf, 0)); 7637 movdqa(xmm3, Address(buf, 16)); 7638 movdqa(xmm4, Address(buf, 32)); 7639 addptr(buf, 48); 7640 subl(len, 3); 7641 jcc(Assembler::lessEqual, L_fold_512b); 7642 7643 // Fold total 512 bits of polynomial on each iteration, 7644 // 128 bits per each of 4 parallel streams. 7645 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32), rscratch1); 7646 7647 align32(); 7648 BIND(L_fold_512b_loop); 7649 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); 7650 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16); 7651 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32); 7652 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48); 7653 addptr(buf, 64); 7654 subl(len, 4); 7655 jcc(Assembler::greater, L_fold_512b_loop); 7656 7657 // Fold 512 bits to 128 bits. 7658 BIND(L_fold_512b); 7659 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1); 7660 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2); 7661 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3); 7662 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4); 7663 7664 // Fold the rest of 128 bits data chunks 7665 BIND(L_fold_tail); 7666 addl(len, 3); 7667 jccb(Assembler::lessEqual, L_fold_128b); 7668 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1); 7669 7670 BIND(L_fold_tail_loop); 7671 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); 7672 addptr(buf, 16); 7673 decrementl(len); 7674 jccb(Assembler::greater, L_fold_tail_loop); 7675 7676 // Fold 128 bits in xmm1 down into 32 bits in crc register. 
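// Reference sketch of the reduction performed below (illustrative only): after
// the carry-less multiplies have shrunk the folded value, the final 32 bits
// are obtained by eight byte-at-a-time table folds of the form
//   crc = timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
// four applied while the value still lives in xmm0 and four more after it has
// been moved into the general-purpose crc register.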
7677 BIND(L_fold_128b); 7678 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()), rscratch1); 7679 if (UseAVX > 0) { 7680 vpclmulqdq(xmm2, xmm0, xmm1, 0x1); 7681 vpand(xmm3, xmm0, xmm2, 0 /* vector_len */); 7682 vpclmulqdq(xmm0, xmm0, xmm3, 0x1); 7683 } else { 7684 movdqa(xmm2, xmm0); 7685 pclmulqdq(xmm2, xmm1, 0x1); 7686 movdqa(xmm3, xmm0); 7687 pand(xmm3, xmm2); 7688 pclmulqdq(xmm0, xmm3, 0x1); 7689 } 7690 psrldq(xmm1, 8); 7691 psrldq(xmm2, 4); 7692 pxor(xmm0, xmm1); 7693 pxor(xmm0, xmm2); 7694 7695 // 8 8-bit folds to compute 32-bit CRC. 7696 for (int j = 0; j < 4; j++) { 7697 fold_8bit_crc32(xmm0, table, xmm1, rax); 7698 } 7699 movdl(crc, xmm0); // mov 32 bits to general register 7700 for (int j = 0; j < 4; j++) { 7701 fold_8bit_crc32(crc, table, rax); 7702 } 7703 7704 BIND(L_tail_restore); 7705 movl(len, tmp); // restore 7706 BIND(L_tail); 7707 andl(len, 0xf); 7708 jccb(Assembler::zero, L_exit); 7709 7710 // Fold the rest of bytes 7711 align(4); 7712 BIND(L_tail_loop); 7713 movsbl(rax, Address(buf, 0)); // load byte with sign extension 7714 update_byte_crc32(crc, rax, table); 7715 increment(buf); 7716 decrementl(len); 7717 jccb(Assembler::greater, L_tail_loop); 7718 7719 BIND(L_exit); 7720 notl(crc); // ~c 7721 } 7722 7723 #ifdef _LP64 7724 // Helper function for AVX 512 CRC32 7725 // Fold 512-bit data chunks 7726 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, 7727 Register pos, int offset) { 7728 evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit); 7729 evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64] 7730 evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0] 7731 evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */); 7732 evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */); 7733 } 7734 7735 // Helper function for AVX 512 CRC32 7736 // Compute CRC32 for < 256B buffers 7737 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos, 7738 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop, 7739 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) { 7740 7741 Label L_less_than_32, L_exact_16_left, L_less_than_16_left; 7742 Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left; 7743 Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2; 7744 7745 // check if there is enough buffer to be able to fold 16B at a time 7746 cmpl(len, 32); 7747 jcc(Assembler::less, L_less_than_32); 7748 7749 // if there is, load the constants 7750 movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10 7751 movdl(xmm0, crc); // get the initial crc value 7752 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext 7753 pxor(xmm7, xmm0); 7754 7755 // update the buffer pointer 7756 addl(pos, 16); 7757 //update the counter.subtract 32 instead of 16 to save one instruction from the loop 7758 subl(len, 32); 7759 jmp(L_16B_reduction_loop); 7760 7761 bind(L_less_than_32); 7762 //mov initial crc to the return value. this is necessary for zero - length buffers. 
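// (When fewer than 16 bytes remain, they are staged through a zeroed 16-byte
// stack slot, shuffled into place with pshufb, and the flow rejoins the common
// 128-bit reduction at L_128_done; buffers shorter than 4 bytes go straight to
// the Barrett step.)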
7763 movl(rax, crc); 7764 testl(len, len); 7765 jcc(Assembler::equal, L_cleanup); 7766 7767 movdl(xmm0, crc); //get the initial crc value 7768 7769 cmpl(len, 16); 7770 jcc(Assembler::equal, L_exact_16_left); 7771 jcc(Assembler::less, L_less_than_16_left); 7772 7773 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext 7774 pxor(xmm7, xmm0); //xor the initial crc value 7775 addl(pos, 16); 7776 subl(len, 16); 7777 movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10 7778 jmp(L_get_last_two_xmms); 7779 7780 bind(L_less_than_16_left); 7781 //use stack space to load data less than 16 bytes, zero - out the 16B in memory first. 7782 pxor(xmm1, xmm1); 7783 movptr(tmp1, rsp); 7784 movdqu(Address(tmp1, 0 * 16), xmm1); 7785 7786 cmpl(len, 4); 7787 jcc(Assembler::less, L_only_less_than_4); 7788 7789 //backup the counter value 7790 movl(tmp2, len); 7791 cmpl(len, 8); 7792 jcc(Assembler::less, L_less_than_8_left); 7793 7794 //load 8 Bytes 7795 movq(rax, Address(buf, pos, Address::times_1, 0 * 16)); 7796 movq(Address(tmp1, 0 * 16), rax); 7797 addptr(tmp1, 8); 7798 subl(len, 8); 7799 addl(pos, 8); 7800 7801 bind(L_less_than_8_left); 7802 cmpl(len, 4); 7803 jcc(Assembler::less, L_less_than_4_left); 7804 7805 //load 4 Bytes 7806 movl(rax, Address(buf, pos, Address::times_1, 0)); 7807 movl(Address(tmp1, 0 * 16), rax); 7808 addptr(tmp1, 4); 7809 subl(len, 4); 7810 addl(pos, 4); 7811 7812 bind(L_less_than_4_left); 7813 cmpl(len, 2); 7814 jcc(Assembler::less, L_less_than_2_left); 7815 7816 // load 2 Bytes 7817 movw(rax, Address(buf, pos, Address::times_1, 0)); 7818 movl(Address(tmp1, 0 * 16), rax); 7819 addptr(tmp1, 2); 7820 subl(len, 2); 7821 addl(pos, 2); 7822 7823 bind(L_less_than_2_left); 7824 cmpl(len, 1); 7825 jcc(Assembler::less, L_zero_left); 7826 7827 // load 1 Byte 7828 movb(rax, Address(buf, pos, Address::times_1, 0)); 7829 movb(Address(tmp1, 0 * 16), rax); 7830 7831 bind(L_zero_left); 7832 movdqu(xmm7, Address(rsp, 0)); 7833 pxor(xmm7, xmm0); //xor the initial crc value 7834 7835 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr())); 7836 movdqu(xmm0, Address(rax, tmp2)); 7837 pshufb(xmm7, xmm0); 7838 jmp(L_128_done); 7839 7840 bind(L_exact_16_left); 7841 movdqu(xmm7, Address(buf, pos, Address::times_1, 0)); 7842 pxor(xmm7, xmm0); //xor the initial crc value 7843 jmp(L_128_done); 7844 7845 bind(L_only_less_than_4); 7846 cmpl(len, 3); 7847 jcc(Assembler::less, L_only_less_than_3); 7848 7849 // load 3 Bytes 7850 movb(rax, Address(buf, pos, Address::times_1, 0)); 7851 movb(Address(tmp1, 0), rax); 7852 7853 movb(rax, Address(buf, pos, Address::times_1, 1)); 7854 movb(Address(tmp1, 1), rax); 7855 7856 movb(rax, Address(buf, pos, Address::times_1, 2)); 7857 movb(Address(tmp1, 2), rax); 7858 7859 movdqu(xmm7, Address(rsp, 0)); 7860 pxor(xmm7, xmm0); //xor the initial crc value 7861 7862 pslldq(xmm7, 0x5); 7863 jmp(L_barrett); 7864 bind(L_only_less_than_3); 7865 cmpl(len, 2); 7866 jcc(Assembler::less, L_only_less_than_2); 7867 7868 // load 2 Bytes 7869 movb(rax, Address(buf, pos, Address::times_1, 0)); 7870 movb(Address(tmp1, 0), rax); 7871 7872 movb(rax, Address(buf, pos, Address::times_1, 1)); 7873 movb(Address(tmp1, 1), rax); 7874 7875 movdqu(xmm7, Address(rsp, 0)); 7876 pxor(xmm7, xmm0); //xor the initial crc value 7877 7878 pslldq(xmm7, 0x6); 7879 jmp(L_barrett); 7880 7881 bind(L_only_less_than_2); 7882 //load 1 Byte 7883 movb(rax, Address(buf, pos, Address::times_1, 0)); 7884 movb(Address(tmp1, 0), rax); 7885 7886 movdqu(xmm7, Address(rsp, 
0)); 7887 pxor(xmm7, xmm0); //xor the initial crc value 7888 7889 pslldq(xmm7, 0x7); 7890 } 7891 7892 /** 7893 * Compute CRC32 using AVX512 instructions 7894 * param crc register containing existing CRC (32-bit) 7895 * param buf register pointing to input byte buffer (byte*) 7896 * param len register containing number of bytes 7897 * param table address of crc or crc32c table 7898 * param tmp1 scratch register 7899 * param tmp2 scratch register 7900 * return rax result register 7901 * 7902 * This routine is identical for crc32c with the exception of the precomputed constant 7903 * table which will be passed as the table argument. The calculation steps are 7904 * the same for both variants. 7905 */ 7906 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) { 7907 assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12); 7908 7909 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; 7910 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; 7911 Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop; 7912 Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop; 7913 Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup; 7914 7915 const Register pos = r12; 7916 push(r12); 7917 subptr(rsp, 16 * 2 + 8); 7918 7919 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 7920 // context for the registers used, where all instructions below are using 128-bit mode 7921 // On EVEX without VL and BW, these instructions will all be AVX. 7922 movl(pos, 0); 7923 7924 // check if smaller than 256B 7925 cmpl(len, 256); 7926 jcc(Assembler::less, L_less_than_256); 7927 7928 // load the initial crc value 7929 movdl(xmm10, crc); 7930 7931 // receive the initial 64B data, xor the initial crc value 7932 evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); 7933 evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); 7934 evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit); 7935 evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4 7936 7937 subl(len, 256); 7938 cmpl(len, 256); 7939 jcc(Assembler::less, L_fold_128_B_loop); 7940 7941 evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); 7942 evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); 7943 evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2 7944 subl(len, 256); 7945 7946 bind(L_fold_256_B_loop); 7947 addl(pos, 256); 7948 fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64); 7949 fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64); 7950 fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64); 7951 fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64); 7952 7953 subl(len, 256); 7954 jcc(Assembler::greaterEqual, L_fold_256_B_loop); 7955 7956 // Fold 256 into 128 7957 addl(pos, 256); 7958 evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit); 7959 evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit); 7960 vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC 7961 7962 evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit); 7963 evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit); 7964 vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC 7965 7966 evmovdquq(xmm0, xmm7, 
Assembler::AVX_512bit); 7967 evmovdquq(xmm4, xmm8, Assembler::AVX_512bit); 7968 7969 addl(len, 128); 7970 jmp(L_fold_128_B_register); 7971 7972 // at this section of the code, there is 128 * x + y(0 <= y<128) bytes of buffer.The fold_128_B_loop 7973 // loop will fold 128B at a time until we have 128 + y Bytes of buffer 7974 7975 // fold 128B at a time.This section of the code folds 8 xmm registers in parallel 7976 bind(L_fold_128_B_loop); 7977 addl(pos, 128); 7978 fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64); 7979 fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64); 7980 7981 subl(len, 128); 7982 jcc(Assembler::greaterEqual, L_fold_128_B_loop); 7983 7984 addl(pos, 128); 7985 7986 // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128 7987 // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 7988 bind(L_fold_128_B_register); 7989 evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16 7990 evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0 7991 evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit); 7992 evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit); 7993 // save last that has no multiplicand 7994 vextracti64x2(xmm7, xmm4, 3); 7995 7996 evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit); 7997 evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit); 7998 // Needed later in reduction loop 7999 movdqu(xmm10, Address(table, 1 * 16)); 8000 vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC 8001 vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC 8002 8003 // Swap 1,0,3,2 - 01 00 11 10 8004 evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit); 8005 evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit); 8006 vextracti128(xmm5, xmm8, 1); 8007 evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit); 8008 8009 // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop 8010 // instead of a cmp instruction, we use the negative flag with the jl instruction 8011 addl(len, 128 - 16); 8012 jcc(Assembler::less, L_final_reduction_for_128); 8013 8014 bind(L_16B_reduction_loop); 8015 vpclmulqdq(xmm8, xmm7, xmm10, 0x01); 8016 vpclmulqdq(xmm7, xmm7, xmm10, 0x10); 8017 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit); 8018 movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16)); 8019 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 8020 addl(pos, 16); 8021 subl(len, 16); 8022 jcc(Assembler::greaterEqual, L_16B_reduction_loop); 8023 8024 bind(L_final_reduction_for_128); 8025 addl(len, 16); 8026 jcc(Assembler::equal, L_128_done); 8027 8028 bind(L_get_last_two_xmms); 8029 movdqu(xmm2, xmm7); 8030 addl(pos, len); 8031 movdqu(xmm1, Address(buf, pos, Address::times_1, -16)); 8032 subl(pos, len); 8033 8034 // get rid of the extra data that was loaded before 8035 // load the shift constant 8036 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr())); 8037 movdqu(xmm0, Address(rax, len)); 8038 addl(rax, len); 8039 8040 vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 8041 //Change mask to 512 8042 vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2); 8043 vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit); 8044 8045 blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit); 8046 vpclmulqdq(xmm8, xmm7, xmm10, 0x01); 8047 vpclmulqdq(xmm7, 
xmm7, xmm10, 0x10); 8048 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit); 8049 vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit); 8050 8051 bind(L_128_done); 8052 // compute crc of a 128-bit value 8053 movdqu(xmm10, Address(table, 3 * 16)); 8054 movdqu(xmm0, xmm7); 8055 8056 // 64b fold 8057 vpclmulqdq(xmm7, xmm7, xmm10, 0x0); 8058 vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit); 8059 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 8060 8061 // 32b fold 8062 movdqu(xmm0, xmm7); 8063 vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit); 8064 vpclmulqdq(xmm7, xmm7, xmm10, 0x10); 8065 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 8066 jmp(L_barrett); 8067 8068 bind(L_less_than_256); 8069 kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup); 8070 8071 //barrett reduction 8072 bind(L_barrett); 8073 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2); 8074 movdqu(xmm1, xmm7); 8075 movdqu(xmm2, xmm7); 8076 movdqu(xmm10, Address(table, 4 * 16)); 8077 8078 pclmulqdq(xmm7, xmm10, 0x0); 8079 pxor(xmm7, xmm2); 8080 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2); 8081 movdqu(xmm2, xmm7); 8082 pclmulqdq(xmm7, xmm10, 0x10); 8083 pxor(xmm7, xmm2); 8084 pxor(xmm7, xmm1); 8085 pextrd(crc, xmm7, 2); 8086 8087 bind(L_cleanup); 8088 addptr(rsp, 16 * 2 + 8); 8089 pop(r12); 8090 } 8091 8092 // S. Gueron / Information Processing Letters 112 (2012) 184 8093 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table. 8094 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0]. 8095 // Output: the 64-bit carry-less product of B * CONST 8096 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n, 8097 Register tmp1, Register tmp2, Register tmp3) { 8098 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr())); 8099 if (n > 0) { 8100 addq(tmp3, n * 256 * 8); 8101 } 8102 // Q1 = TABLEExt[n][B & 0xFF]; 8103 movl(tmp1, in); 8104 andl(tmp1, 0x000000FF); 8105 shll(tmp1, 3); 8106 addq(tmp1, tmp3); 8107 movq(tmp1, Address(tmp1, 0)); 8108 8109 // Q2 = TABLEExt[n][B >> 8 & 0xFF]; 8110 movl(tmp2, in); 8111 shrl(tmp2, 8); 8112 andl(tmp2, 0x000000FF); 8113 shll(tmp2, 3); 8114 addq(tmp2, tmp3); 8115 movq(tmp2, Address(tmp2, 0)); 8116 8117 shlq(tmp2, 8); 8118 xorq(tmp1, tmp2); 8119 8120 // Q3 = TABLEExt[n][B >> 16 & 0xFF]; 8121 movl(tmp2, in); 8122 shrl(tmp2, 16); 8123 andl(tmp2, 0x000000FF); 8124 shll(tmp2, 3); 8125 addq(tmp2, tmp3); 8126 movq(tmp2, Address(tmp2, 0)); 8127 8128 shlq(tmp2, 16); 8129 xorq(tmp1, tmp2); 8130 8131 // Q4 = TABLEExt[n][B >> 24 & 0xFF]; 8132 shrl(in, 24); 8133 andl(in, 0x000000FF); 8134 shll(in, 3); 8135 addq(in, tmp3); 8136 movq(in, Address(in, 0)); 8137 8138 shlq(in, 24); 8139 xorq(in, tmp1); 8140 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; 8141 } 8142 8143 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1, 8144 Register in_out, 8145 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported, 8146 XMMRegister w_xtmp2, 8147 Register tmp1, 8148 Register n_tmp2, Register n_tmp3) { 8149 if (is_pclmulqdq_supported) { 8150 movdl(w_xtmp1, in_out); // modified blindly 8151 8152 movl(tmp1, const_or_pre_comp_const_index); 8153 movdl(w_xtmp2, tmp1); 8154 pclmulqdq(w_xtmp1, w_xtmp2, 0); 8155 8156 movdq(in_out, w_xtmp1); 8157 } else { 8158 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3); 8159 } 8160 } 8161 8162 // 
Recombination Alternative 2: No bit-reflections 8163 // T1 = (CRC_A * U1) << 1 8164 // T2 = (CRC_B * U2) << 1 8165 // C1 = T1 >> 32 8166 // C2 = T2 >> 32 8167 // T1 = T1 & 0xFFFFFFFF 8168 // T2 = T2 & 0xFFFFFFFF 8169 // T1 = CRC32(0, T1) 8170 // T2 = CRC32(0, T2) 8171 // C1 = C1 ^ T1 8172 // C2 = C2 ^ T2 8173 // CRC = C1 ^ C2 ^ CRC_C 8174 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2, 8175 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 8176 Register tmp1, Register tmp2, 8177 Register n_tmp3) { 8178 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 8179 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 8180 shlq(in_out, 1); 8181 movl(tmp1, in_out); 8182 shrq(in_out, 32); 8183 xorl(tmp2, tmp2); 8184 crc32(tmp2, tmp1, 4); 8185 xorl(in_out, tmp2); // we don't care about upper 32 bit contents here 8186 shlq(in1, 1); 8187 movl(tmp1, in1); 8188 shrq(in1, 32); 8189 xorl(tmp2, tmp2); 8190 crc32(tmp2, tmp1, 4); 8191 xorl(in1, tmp2); 8192 xorl(in_out, in1); 8193 xorl(in_out, in2); 8194 } 8195 8196 // Set N to predefined value 8197 // Subtract from a length of a buffer 8198 // execute in a loop: 8199 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0 8200 // for i = 1 to N do 8201 // CRC_A = CRC32(CRC_A, A[i]) 8202 // CRC_B = CRC32(CRC_B, B[i]) 8203 // CRC_C = CRC32(CRC_C, C[i]) 8204 // end for 8205 // Recombine 8206 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, 8207 Register in_out1, Register in_out2, Register in_out3, 8208 Register tmp1, Register tmp2, Register tmp3, 8209 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 8210 Register tmp4, Register tmp5, 8211 Register n_tmp6) { 8212 Label L_processPartitions; 8213 Label L_processPartition; 8214 Label L_exit; 8215 8216 bind(L_processPartitions); 8217 cmpl(in_out1, 3 * size); 8218 jcc(Assembler::less, L_exit); 8219 xorl(tmp1, tmp1); 8220 xorl(tmp2, tmp2); 8221 movq(tmp3, in_out2); 8222 addq(tmp3, size); 8223 8224 bind(L_processPartition); 8225 crc32(in_out3, Address(in_out2, 0), 8); 8226 crc32(tmp1, Address(in_out2, size), 8); 8227 crc32(tmp2, Address(in_out2, size * 2), 8); 8228 addq(in_out2, 8); 8229 cmpq(in_out2, tmp3); 8230 jcc(Assembler::less, L_processPartition); 8231 crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2, 8232 w_xtmp1, w_xtmp2, w_xtmp3, 8233 tmp4, tmp5, 8234 n_tmp6); 8235 addq(in_out2, 2 * size); 8236 subl(in_out1, 3 * size); 8237 jmp(L_processPartitions); 8238 8239 bind(L_exit); 8240 } 8241 #else 8242 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n, 8243 Register tmp1, Register tmp2, Register tmp3, 8244 XMMRegister xtmp1, XMMRegister xtmp2) { 8245 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr())); 8246 if (n > 0) { 8247 addl(tmp3, n * 256 * 8); 8248 } 8249 // Q1 = TABLEExt[n][B & 0xFF]; 8250 movl(tmp1, in_out); 8251 andl(tmp1, 0x000000FF); 8252 shll(tmp1, 3); 8253 addl(tmp1, tmp3); 8254 movq(xtmp1, Address(tmp1, 0)); 8255 8256 // Q2 = TABLEExt[n][B >> 8 & 0xFF]; 8257 movl(tmp2, in_out); 8258 shrl(tmp2, 8); 8259 andl(tmp2, 0x000000FF); 8260 shll(tmp2, 3); 8261 addl(tmp2, tmp3); 8262 movq(xtmp2, 
Address(tmp2, 0)); 8263 8264 psllq(xtmp2, 8); 8265 pxor(xtmp1, xtmp2); 8266 8267 // Q3 = TABLEExt[n][B >> 16 & 0xFF]; 8268 movl(tmp2, in_out); 8269 shrl(tmp2, 16); 8270 andl(tmp2, 0x000000FF); 8271 shll(tmp2, 3); 8272 addl(tmp2, tmp3); 8273 movq(xtmp2, Address(tmp2, 0)); 8274 8275 psllq(xtmp2, 16); 8276 pxor(xtmp1, xtmp2); 8277 8278 // Q4 = TABLEExt[n][B >> 24 & 0xFF]; 8279 shrl(in_out, 24); 8280 andl(in_out, 0x000000FF); 8281 shll(in_out, 3); 8282 addl(in_out, tmp3); 8283 movq(xtmp2, Address(in_out, 0)); 8284 8285 psllq(xtmp2, 24); 8286 pxor(xtmp1, xtmp2); // Result in CXMM 8287 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; 8288 } 8289 8290 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1, 8291 Register in_out, 8292 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported, 8293 XMMRegister w_xtmp2, 8294 Register tmp1, 8295 Register n_tmp2, Register n_tmp3) { 8296 if (is_pclmulqdq_supported) { 8297 movdl(w_xtmp1, in_out); 8298 8299 movl(tmp1, const_or_pre_comp_const_index); 8300 movdl(w_xtmp2, tmp1); 8301 pclmulqdq(w_xtmp1, w_xtmp2, 0); 8302 // Keep result in XMM since GPR is 32 bit in length 8303 } else { 8304 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2); 8305 } 8306 } 8307 8308 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2, 8309 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 8310 Register tmp1, Register tmp2, 8311 Register n_tmp3) { 8312 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 8313 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 8314 8315 psllq(w_xtmp1, 1); 8316 movdl(tmp1, w_xtmp1); 8317 psrlq(w_xtmp1, 32); 8318 movdl(in_out, w_xtmp1); 8319 8320 xorl(tmp2, tmp2); 8321 crc32(tmp2, tmp1, 4); 8322 xorl(in_out, tmp2); 8323 8324 psllq(w_xtmp2, 1); 8325 movdl(tmp1, w_xtmp2); 8326 psrlq(w_xtmp2, 32); 8327 movdl(in1, w_xtmp2); 8328 8329 xorl(tmp2, tmp2); 8330 crc32(tmp2, tmp1, 4); 8331 xorl(in1, tmp2); 8332 xorl(in_out, in1); 8333 xorl(in_out, in2); 8334 } 8335 8336 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, 8337 Register in_out1, Register in_out2, Register in_out3, 8338 Register tmp1, Register tmp2, Register tmp3, 8339 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 8340 Register tmp4, Register tmp5, 8341 Register n_tmp6) { 8342 Label L_processPartitions; 8343 Label L_processPartition; 8344 Label L_exit; 8345 8346 bind(L_processPartitions); 8347 cmpl(in_out1, 3 * size); 8348 jcc(Assembler::less, L_exit); 8349 xorl(tmp1, tmp1); 8350 xorl(tmp2, tmp2); 8351 movl(tmp3, in_out2); 8352 addl(tmp3, size); 8353 8354 bind(L_processPartition); 8355 crc32(in_out3, Address(in_out2, 0), 4); 8356 crc32(tmp1, Address(in_out2, size), 4); 8357 crc32(tmp2, Address(in_out2, size*2), 4); 8358 crc32(in_out3, Address(in_out2, 0+4), 4); 8359 crc32(tmp1, Address(in_out2, size+4), 4); 8360 crc32(tmp2, Address(in_out2, size*2+4), 4); 8361 addl(in_out2, 8); 8362 cmpl(in_out2, tmp3); 8363 jcc(Assembler::less, L_processPartition); 8364 8365 push(tmp3); 8366 push(in_out1); 8367 push(in_out2); 8368 tmp4 = tmp3; 8369 tmp5 = in_out1; 8370 n_tmp6 = in_out2; 8371 8372 
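// With only eight general-purpose registers available on 32-bit x86, the
// chunk-combining helper cannot be given dedicated temporaries here; tmp3,
// in_out1 and in_out2 were saved on the stack above so they can double as
// tmp4/tmp5/n_tmp6 for the call and be restored immediately afterwards.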
crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2, 8373 w_xtmp1, w_xtmp2, w_xtmp3, 8374 tmp4, tmp5, 8375 n_tmp6); 8376 8377 pop(in_out2); 8378 pop(in_out1); 8379 pop(tmp3); 8380 8381 addl(in_out2, 2 * size); 8382 subl(in_out1, 3 * size); 8383 jmp(L_processPartitions); 8384 8385 bind(L_exit); 8386 } 8387 #endif //LP64 8388 8389 #ifdef _LP64 8390 // Algorithm 2: Pipelined usage of the CRC32 instruction. 8391 // Input: A buffer I of L bytes. 8392 // Output: the CRC32C value of the buffer. 8393 // Notations: 8394 // Write L = 24N + r, with N = floor (L/24). 8395 // r = L mod 24 (0 <= r < 24). 8396 // Consider I as the concatenation of A|B|C|R, where A, B, C, each, 8397 // N quadwords, and R consists of r bytes. 8398 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1 8399 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1 8400 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1 8401 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1 8402 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2, 8403 Register tmp1, Register tmp2, Register tmp3, 8404 Register tmp4, Register tmp5, Register tmp6, 8405 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 8406 bool is_pclmulqdq_supported) { 8407 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS]; 8408 Label L_wordByWord; 8409 Label L_byteByByteProlog; 8410 Label L_byteByByte; 8411 Label L_exit; 8412 8413 if (is_pclmulqdq_supported ) { 8414 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; 8415 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1); 8416 8417 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2); 8418 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3); 8419 8420 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4); 8421 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5); 8422 assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\""); 8423 } else { 8424 const_or_pre_comp_const_index[0] = 1; 8425 const_or_pre_comp_const_index[1] = 0; 8426 8427 const_or_pre_comp_const_index[2] = 3; 8428 const_or_pre_comp_const_index[3] = 2; 8429 8430 const_or_pre_comp_const_index[4] = 5; 8431 const_or_pre_comp_const_index[5] = 4; 8432 } 8433 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported, 8434 in2, in1, in_out, 8435 tmp1, tmp2, tmp3, 8436 w_xtmp1, w_xtmp2, w_xtmp3, 8437 tmp4, tmp5, 8438 tmp6); 8439 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported, 8440 in2, in1, in_out, 8441 tmp1, tmp2, tmp3, 8442 w_xtmp1, w_xtmp2, w_xtmp3, 8443 tmp4, tmp5, 8444 tmp6); 8445 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported, 8446 in2, in1, in_out, 8447 tmp1, tmp2, tmp3, 8448 w_xtmp1, w_xtmp2, w_xtmp3, 8449 tmp4, tmp5, 8450 tmp6); 8451 movl(tmp1, in2); 8452 andl(tmp1, 0x00000007); 8453 negl(tmp1); 8454 addl(tmp1, in2); 8455 addq(tmp1, in1); 8456 8457 cmpq(in1, tmp1); 8458 jccb(Assembler::greaterEqual, L_byteByByteProlog); 8459 align(16); 8460 BIND(L_wordByWord); 8461 crc32(in_out, Address(in1, 0), 8); 8462 addq(in1, 8); 8463 cmpq(in1, tmp1); 8464 
jcc(Assembler::less, L_wordByWord); 8465 8466 BIND(L_byteByByteProlog); 8467 andl(in2, 0x00000007); 8468 movl(tmp2, 1); 8469 8470 cmpl(tmp2, in2); 8471 jccb(Assembler::greater, L_exit); 8472 BIND(L_byteByByte); 8473 crc32(in_out, Address(in1, 0), 1); 8474 incq(in1); 8475 incl(tmp2); 8476 cmpl(tmp2, in2); 8477 jcc(Assembler::lessEqual, L_byteByByte); 8478 8479 BIND(L_exit); 8480 } 8481 #else 8482 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2, 8483 Register tmp1, Register tmp2, Register tmp3, 8484 Register tmp4, Register tmp5, Register tmp6, 8485 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 8486 bool is_pclmulqdq_supported) { 8487 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS]; 8488 Label L_wordByWord; 8489 Label L_byteByByteProlog; 8490 Label L_byteByByte; 8491 Label L_exit; 8492 8493 if (is_pclmulqdq_supported) { 8494 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; 8495 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1); 8496 8497 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2); 8498 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3); 8499 8500 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4); 8501 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5); 8502 } else { 8503 const_or_pre_comp_const_index[0] = 1; 8504 const_or_pre_comp_const_index[1] = 0; 8505 8506 const_or_pre_comp_const_index[2] = 3; 8507 const_or_pre_comp_const_index[3] = 2; 8508 8509 const_or_pre_comp_const_index[4] = 5; 8510 const_or_pre_comp_const_index[5] = 4; 8511 } 8512 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported, 8513 in2, in1, in_out, 8514 tmp1, tmp2, tmp3, 8515 w_xtmp1, w_xtmp2, w_xtmp3, 8516 tmp4, tmp5, 8517 tmp6); 8518 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported, 8519 in2, in1, in_out, 8520 tmp1, tmp2, tmp3, 8521 w_xtmp1, w_xtmp2, w_xtmp3, 8522 tmp4, tmp5, 8523 tmp6); 8524 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported, 8525 in2, in1, in_out, 8526 tmp1, tmp2, tmp3, 8527 w_xtmp1, w_xtmp2, w_xtmp3, 8528 tmp4, tmp5, 8529 tmp6); 8530 movl(tmp1, in2); 8531 andl(tmp1, 0x00000007); 8532 negl(tmp1); 8533 addl(tmp1, in2); 8534 addl(tmp1, in1); 8535 8536 BIND(L_wordByWord); 8537 cmpl(in1, tmp1); 8538 jcc(Assembler::greaterEqual, L_byteByByteProlog); 8539 crc32(in_out, Address(in1,0), 4); 8540 addl(in1, 4); 8541 jmp(L_wordByWord); 8542 8543 BIND(L_byteByByteProlog); 8544 andl(in2, 0x00000007); 8545 movl(tmp2, 1); 8546 8547 BIND(L_byteByByte); 8548 cmpl(tmp2, in2); 8549 jccb(Assembler::greater, L_exit); 8550 movb(tmp1, Address(in1, 0)); 8551 crc32(in_out, tmp1, 1); 8552 incl(in1); 8553 incl(tmp2); 8554 jmp(L_byteByByte); 8555 8556 BIND(L_exit); 8557 } 8558 #endif // LP64 8559 #undef BIND 8560 #undef BLOCK_COMMENT 8561 8562 // Compress char[] array to byte[]. 
8563 // ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java 8564 // @IntrinsicCandidate 8565 // private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) { 8566 // for (int i = 0; i < len; i++) { 8567 // int c = src[srcOff++]; 8568 // if (c >>> 8 != 0) { 8569 // return 0; 8570 // } 8571 // dst[dstOff++] = (byte)c; 8572 // } 8573 // return len; 8574 // } 8575 void MacroAssembler::char_array_compress(Register src, Register dst, Register len, 8576 XMMRegister tmp1Reg, XMMRegister tmp2Reg, 8577 XMMRegister tmp3Reg, XMMRegister tmp4Reg, 8578 Register tmp5, Register result, KRegister mask1, KRegister mask2) { 8579 Label copy_chars_loop, return_length, return_zero, done; 8580 8581 // rsi: src 8582 // rdi: dst 8583 // rdx: len 8584 // rcx: tmp5 8585 // rax: result 8586 8587 // rsi holds start addr of source char[] to be compressed 8588 // rdi holds start addr of destination byte[] 8589 // rdx holds length 8590 8591 assert(len != result, ""); 8592 8593 // save length for return 8594 push(len); 8595 8596 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 8597 VM_Version::supports_avx512vlbw() && 8598 VM_Version::supports_bmi2()) { 8599 8600 Label copy_32_loop, copy_loop_tail, below_threshold; 8601 8602 // alignment 8603 Label post_alignment; 8604 8605 // if length of the string is less than 16, handle it in an old fashioned way 8606 testl(len, -32); 8607 jcc(Assembler::zero, below_threshold); 8608 8609 // First check whether a character is compressible ( <= 0xFF). 8610 // Create mask to test for Unicode chars inside zmm vector 8611 movl(result, 0x00FF); 8612 evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit); 8613 8614 testl(len, -64); 8615 jcc(Assembler::zero, post_alignment); 8616 8617 movl(tmp5, dst); 8618 andl(tmp5, (32 - 1)); 8619 negl(tmp5); 8620 andl(tmp5, (32 - 1)); 8621 8622 // bail out when there is nothing to be done 8623 testl(tmp5, 0xFFFFFFFF); 8624 jcc(Assembler::zero, post_alignment); 8625 8626 // ~(~0 << len), where len is the # of remaining elements to process 8627 movl(result, 0xFFFFFFFF); 8628 shlxl(result, result, tmp5); 8629 notl(result); 8630 kmovdl(mask2, result); 8631 8632 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); 8633 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit); 8634 ktestd(mask1, mask2); 8635 jcc(Assembler::carryClear, return_zero); 8636 8637 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit); 8638 8639 addptr(src, tmp5); 8640 addptr(src, tmp5); 8641 addptr(dst, tmp5); 8642 subl(len, tmp5); 8643 8644 bind(post_alignment); 8645 // end of alignment 8646 8647 movl(tmp5, len); 8648 andl(tmp5, (32 - 1)); // tail count (in chars) 8649 andl(len, ~(32 - 1)); // vector count (in chars) 8650 jcc(Assembler::zero, copy_loop_tail); 8651 8652 lea(src, Address(src, len, Address::times_2)); 8653 lea(dst, Address(dst, len, Address::times_1)); 8654 negptr(len); 8655 8656 bind(copy_32_loop); 8657 evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit); 8658 evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); 8659 kortestdl(mask1, mask1); 8660 jcc(Assembler::carryClear, return_zero); 8661 8662 // All elements in current processed chunk are valid candidates for 8663 // compression. Write a truncated byte elements to the memory. 
8664 evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit); 8665 addptr(len, 32); 8666 jcc(Assembler::notZero, copy_32_loop); 8667 8668 bind(copy_loop_tail); 8669 // bail out when there is nothing to be done 8670 testl(tmp5, 0xFFFFFFFF); 8671 jcc(Assembler::zero, return_length); 8672 8673 movl(len, tmp5); 8674 8675 // ~(~0 << len), where len is the # of remaining elements to process 8676 movl(result, 0xFFFFFFFF); 8677 shlxl(result, result, len); 8678 notl(result); 8679 8680 kmovdl(mask2, result); 8681 8682 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); 8683 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit); 8684 ktestd(mask1, mask2); 8685 jcc(Assembler::carryClear, return_zero); 8686 8687 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit); 8688 jmp(return_length); 8689 8690 bind(below_threshold); 8691 } 8692 8693 if (UseSSE42Intrinsics) { 8694 Label copy_32_loop, copy_16, copy_tail; 8695 8696 movl(result, len); 8697 8698 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors 8699 8700 // vectored compression 8701 andl(len, 0xfffffff0); // vector count (in chars) 8702 andl(result, 0x0000000f); // tail count (in chars) 8703 testl(len, len); 8704 jcc(Assembler::zero, copy_16); 8705 8706 // compress 16 chars per iter 8707 movdl(tmp1Reg, tmp5); 8708 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg 8709 pxor(tmp4Reg, tmp4Reg); 8710 8711 lea(src, Address(src, len, Address::times_2)); 8712 lea(dst, Address(dst, len, Address::times_1)); 8713 negptr(len); 8714 8715 bind(copy_32_loop); 8716 movdqu(tmp2Reg, Address(src, len, Address::times_2)); // load 1st 8 characters 8717 por(tmp4Reg, tmp2Reg); 8718 movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters 8719 por(tmp4Reg, tmp3Reg); 8720 ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector 8721 jcc(Assembler::notZero, return_zero); 8722 packuswb(tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte 8723 movdqu(Address(dst, len, Address::times_1), tmp2Reg); 8724 addptr(len, 16); 8725 jcc(Assembler::notZero, copy_32_loop); 8726 8727 // compress next vector of 8 chars (if any) 8728 bind(copy_16); 8729 movl(len, result); 8730 andl(len, 0xfffffff8); // vector count (in chars) 8731 andl(result, 0x00000007); // tail count (in chars) 8732 testl(len, len); 8733 jccb(Assembler::zero, copy_tail); 8734 8735 movdl(tmp1Reg, tmp5); 8736 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg 8737 pxor(tmp3Reg, tmp3Reg); 8738 8739 movdqu(tmp2Reg, Address(src, 0)); 8740 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector 8741 jccb(Assembler::notZero, return_zero); 8742 packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte 8743 movq(Address(dst, 0), tmp2Reg); 8744 addptr(src, 16); 8745 addptr(dst, 8); 8746 8747 bind(copy_tail); 8748 movl(len, result); 8749 } 8750 // compress 1 char per iter 8751 testl(len, len); 8752 jccb(Assembler::zero, return_length); 8753 lea(src, Address(src, len, Address::times_2)); 8754 lea(dst, Address(dst, len, Address::times_1)); 8755 negptr(len); 8756 8757 bind(copy_chars_loop); 8758 load_unsigned_short(result, Address(src, len, Address::times_2)); 8759 testl(result, 0xff00); // check if Unicode char 8760 jccb(Assembler::notZero, return_zero); 8761 movb(Address(dst, len, Address::times_1), result); // ASCII char; compress to 1 byte 8762 increment(len); 8763 jcc(Assembler::notZero, copy_chars_loop); 
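// Worked example of the "~(~0 << len)" tail-mask idiom used in the AVX512
// blocks above (illustrative value len = 5):
//   ~0          = 0xFFFFFFFF
//   ~0 << 5     = 0xFFFFFFE0
//   ~(~0 << 5)  = 0x0000001F
// i.e. a k-register mask selecting exactly the 5 remaining char lanes.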
8764 8765 // if compression succeeded, return length 8766 bind(return_length); 8767 pop(result); 8768 jmpb(done); 8769 8770 // if compression failed, return 0 8771 bind(return_zero); 8772 xorl(result, result); 8773 addptr(rsp, wordSize); 8774 8775 bind(done); 8776 } 8777 8778 // Inflate byte[] array to char[]. 8779 // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java 8780 // @IntrinsicCandidate 8781 // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) { 8782 // for (int i = 0; i < len; i++) { 8783 // dst[dstOff++] = (char)(src[srcOff++] & 0xff); 8784 // } 8785 // } 8786 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, 8787 XMMRegister tmp1, Register tmp2, KRegister mask) { 8788 Label copy_chars_loop, done, below_threshold, avx3_threshold; 8789 // rsi: src 8790 // rdi: dst 8791 // rdx: len 8792 // rcx: tmp2 8793 8794 // rsi holds start addr of source byte[] to be inflated 8795 // rdi holds start addr of destination char[] 8796 // rdx holds length 8797 assert_different_registers(src, dst, len, tmp2); 8798 movl(tmp2, len); 8799 if ((UseAVX > 2) && // AVX512 8800 VM_Version::supports_avx512vlbw() && 8801 VM_Version::supports_bmi2()) { 8802 8803 Label copy_32_loop, copy_tail; 8804 Register tmp3_aliased = len; 8805 8806 // if length of the string is less than 16, handle it in an old fashioned way 8807 testl(len, -16); 8808 jcc(Assembler::zero, below_threshold); 8809 8810 testl(len, -1 * AVX3Threshold); 8811 jcc(Assembler::zero, avx3_threshold); 8812 8813 // In order to use only one arithmetic operation for the main loop we use 8814 // this pre-calculation 8815 andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop 8816 andl(len, -32); // vector count 8817 jccb(Assembler::zero, copy_tail); 8818 8819 lea(src, Address(src, len, Address::times_1)); 8820 lea(dst, Address(dst, len, Address::times_2)); 8821 negptr(len); 8822 8823 8824 // inflate 32 chars per iter 8825 bind(copy_32_loop); 8826 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit); 8827 evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit); 8828 addptr(len, 32); 8829 jcc(Assembler::notZero, copy_32_loop); 8830 8831 bind(copy_tail); 8832 // bail out when there is nothing to be done 8833 testl(tmp2, -1); // we don't destroy the contents of tmp2 here 8834 jcc(Assembler::zero, done); 8835 8836 // ~(~0 << length), where length is the # of remaining elements to process 8837 movl(tmp3_aliased, -1); 8838 shlxl(tmp3_aliased, tmp3_aliased, tmp2); 8839 notl(tmp3_aliased); 8840 kmovdl(mask, tmp3_aliased); 8841 evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit); 8842 evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit); 8843 8844 jmp(done); 8845 bind(avx3_threshold); 8846 } 8847 if (UseSSE42Intrinsics) { 8848 Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail; 8849 8850 if (UseAVX > 1) { 8851 andl(tmp2, (16 - 1)); 8852 andl(len, -16); 8853 jccb(Assembler::zero, copy_new_tail); 8854 } else { 8855 andl(tmp2, 0x00000007); // tail count (in chars) 8856 andl(len, 0xfffffff8); // vector count (in chars) 8857 jccb(Assembler::zero, copy_tail); 8858 } 8859 8860 // vectored inflation 8861 lea(src, Address(src, len, Address::times_1)); 8862 lea(dst, Address(dst, len, Address::times_2)); 8863 negptr(len); 8864 8865 if (UseAVX > 1) { 8866 bind(copy_16_loop); 8867 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit); 8868 vmovdqu(Address(dst, len, 
Address::times_2), tmp1); 8869 addptr(len, 16); 8870 jcc(Assembler::notZero, copy_16_loop); 8871 8872 bind(below_threshold); 8873 bind(copy_new_tail); 8874 movl(len, tmp2); 8875 andl(tmp2, 0x00000007); 8876 andl(len, 0xFFFFFFF8); 8877 jccb(Assembler::zero, copy_tail); 8878 8879 pmovzxbw(tmp1, Address(src, 0)); 8880 movdqu(Address(dst, 0), tmp1); 8881 addptr(src, 8); 8882 addptr(dst, 2 * 8); 8883 8884 jmp(copy_tail, true); 8885 } 8886 8887 // inflate 8 chars per iter 8888 bind(copy_8_loop); 8889 pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words 8890 movdqu(Address(dst, len, Address::times_2), tmp1); 8891 addptr(len, 8); 8892 jcc(Assembler::notZero, copy_8_loop); 8893 8894 bind(copy_tail); 8895 movl(len, tmp2); 8896 8897 cmpl(len, 4); 8898 jccb(Assembler::less, copy_bytes); 8899 8900 movdl(tmp1, Address(src, 0)); // load 4 byte chars 8901 pmovzxbw(tmp1, tmp1); 8902 movq(Address(dst, 0), tmp1); 8903 subptr(len, 4); 8904 addptr(src, 4); 8905 addptr(dst, 8); 8906 8907 bind(copy_bytes); 8908 } else { 8909 bind(below_threshold); 8910 } 8911 8912 testl(len, len); 8913 jccb(Assembler::zero, done); 8914 lea(src, Address(src, len, Address::times_1)); 8915 lea(dst, Address(dst, len, Address::times_2)); 8916 negptr(len); 8917 8918 // inflate 1 char per iter 8919 bind(copy_chars_loop); 8920 load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char 8921 movw(Address(dst, len, Address::times_2), tmp2); // inflate byte char to word 8922 increment(len); 8923 jcc(Assembler::notZero, copy_chars_loop); 8924 8925 bind(done); 8926 } 8927 8928 8929 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 8930 switch(type) { 8931 case T_BYTE: 8932 case T_BOOLEAN: 8933 evmovdqub(dst, kmask, src, merge, vector_len); 8934 break; 8935 case T_CHAR: 8936 case T_SHORT: 8937 evmovdquw(dst, kmask, src, merge, vector_len); 8938 break; 8939 case T_INT: 8940 case T_FLOAT: 8941 evmovdqul(dst, kmask, src, merge, vector_len); 8942 break; 8943 case T_LONG: 8944 case T_DOUBLE: 8945 evmovdquq(dst, kmask, src, merge, vector_len); 8946 break; 8947 default: 8948 fatal("Unexpected type argument %s", type2name(type)); 8949 break; 8950 } 8951 } 8952 8953 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 8954 switch(type) { 8955 case T_BYTE: 8956 case T_BOOLEAN: 8957 evmovdqub(dst, kmask, src, merge, vector_len); 8958 break; 8959 case T_CHAR: 8960 case T_SHORT: 8961 evmovdquw(dst, kmask, src, merge, vector_len); 8962 break; 8963 case T_INT: 8964 case T_FLOAT: 8965 evmovdqul(dst, kmask, src, merge, vector_len); 8966 break; 8967 case T_LONG: 8968 case T_DOUBLE: 8969 evmovdquq(dst, kmask, src, merge, vector_len); 8970 break; 8971 default: 8972 fatal("Unexpected type argument %s", type2name(type)); 8973 break; 8974 } 8975 } 8976 8977 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) { 8978 switch(masklen) { 8979 case 2: 8980 knotbl(dst, src); 8981 movl(rtmp, 3); 8982 kmovbl(ktmp, rtmp); 8983 kandbl(dst, ktmp, dst); 8984 break; 8985 case 4: 8986 knotbl(dst, src); 8987 movl(rtmp, 15); 8988 kmovbl(ktmp, rtmp); 8989 kandbl(dst, ktmp, dst); 8990 break; 8991 case 8: 8992 knotbl(dst, src); 8993 break; 8994 case 16: 8995 knotwl(dst, src); 8996 break; 8997 case 32: 8998 knotdl(dst, src); 8999 break; 9000 case 64: 9001 knotql(dst, src); 9002 break; 9003 default: 9004 fatal("Unexpected vector length %d", masklen); 9005 
break; 9006 } 9007 } 9008 9009 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) { 9010 switch(type) { 9011 case T_BOOLEAN: 9012 case T_BYTE: 9013 kandbl(dst, src1, src2); 9014 break; 9015 case T_CHAR: 9016 case T_SHORT: 9017 kandwl(dst, src1, src2); 9018 break; 9019 case T_INT: 9020 case T_FLOAT: 9021 kanddl(dst, src1, src2); 9022 break; 9023 case T_LONG: 9024 case T_DOUBLE: 9025 kandql(dst, src1, src2); 9026 break; 9027 default: 9028 fatal("Unexpected type argument %s", type2name(type)); 9029 break; 9030 } 9031 } 9032 9033 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) { 9034 switch(type) { 9035 case T_BOOLEAN: 9036 case T_BYTE: 9037 korbl(dst, src1, src2); 9038 break; 9039 case T_CHAR: 9040 case T_SHORT: 9041 korwl(dst, src1, src2); 9042 break; 9043 case T_INT: 9044 case T_FLOAT: 9045 kordl(dst, src1, src2); 9046 break; 9047 case T_LONG: 9048 case T_DOUBLE: 9049 korql(dst, src1, src2); 9050 break; 9051 default: 9052 fatal("Unexpected type argument %s", type2name(type)); 9053 break; 9054 } 9055 } 9056 9057 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) { 9058 switch(type) { 9059 case T_BOOLEAN: 9060 case T_BYTE: 9061 kxorbl(dst, src1, src2); 9062 break; 9063 case T_CHAR: 9064 case T_SHORT: 9065 kxorwl(dst, src1, src2); 9066 break; 9067 case T_INT: 9068 case T_FLOAT: 9069 kxordl(dst, src1, src2); 9070 break; 9071 case T_LONG: 9072 case T_DOUBLE: 9073 kxorql(dst, src1, src2); 9074 break; 9075 default: 9076 fatal("Unexpected type argument %s", type2name(type)); 9077 break; 9078 } 9079 } 9080 9081 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 9082 switch(type) { 9083 case T_BOOLEAN: 9084 case T_BYTE: 9085 evpermb(dst, mask, nds, src, merge, vector_len); break; 9086 case T_CHAR: 9087 case T_SHORT: 9088 evpermw(dst, mask, nds, src, merge, vector_len); break; 9089 case T_INT: 9090 case T_FLOAT: 9091 evpermd(dst, mask, nds, src, merge, vector_len); break; 9092 case T_LONG: 9093 case T_DOUBLE: 9094 evpermq(dst, mask, nds, src, merge, vector_len); break; 9095 default: 9096 fatal("Unexpected type argument %s", type2name(type)); break; 9097 } 9098 } 9099 9100 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9101 switch(type) { 9102 case T_BOOLEAN: 9103 case T_BYTE: 9104 evpermb(dst, mask, nds, src, merge, vector_len); break; 9105 case T_CHAR: 9106 case T_SHORT: 9107 evpermw(dst, mask, nds, src, merge, vector_len); break; 9108 case T_INT: 9109 case T_FLOAT: 9110 evpermd(dst, mask, nds, src, merge, vector_len); break; 9111 case T_LONG: 9112 case T_DOUBLE: 9113 evpermq(dst, mask, nds, src, merge, vector_len); break; 9114 default: 9115 fatal("Unexpected type argument %s", type2name(type)); break; 9116 } 9117 } 9118 9119 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9120 switch(type) { 9121 case T_BYTE: 9122 evpminsb(dst, mask, nds, src, merge, vector_len); break; 9123 case T_SHORT: 9124 evpminsw(dst, mask, nds, src, merge, vector_len); break; 9125 case T_INT: 9126 evpminsd(dst, mask, nds, src, merge, vector_len); break; 9127 case T_LONG: 9128 evpminsq(dst, mask, nds, src, merge, vector_len); break; 9129 default: 9130 fatal("Unexpected type argument %s", type2name(type)); break; 9131 } 9132 } 9133 9134 void 
MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9135 switch(type) { 9136 case T_BYTE: 9137 evpmaxsb(dst, mask, nds, src, merge, vector_len); break; 9138 case T_SHORT: 9139 evpmaxsw(dst, mask, nds, src, merge, vector_len); break; 9140 case T_INT: 9141 evpmaxsd(dst, mask, nds, src, merge, vector_len); break; 9142 case T_LONG: 9143 evpmaxsq(dst, mask, nds, src, merge, vector_len); break; 9144 default: 9145 fatal("Unexpected type argument %s", type2name(type)); break; 9146 } 9147 } 9148 9149 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 9150 switch(type) { 9151 case T_BYTE: 9152 evpminsb(dst, mask, nds, src, merge, vector_len); break; 9153 case T_SHORT: 9154 evpminsw(dst, mask, nds, src, merge, vector_len); break; 9155 case T_INT: 9156 evpminsd(dst, mask, nds, src, merge, vector_len); break; 9157 case T_LONG: 9158 evpminsq(dst, mask, nds, src, merge, vector_len); break; 9159 default: 9160 fatal("Unexpected type argument %s", type2name(type)); break; 9161 } 9162 } 9163 9164 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 9165 switch(type) { 9166 case T_BYTE: 9167 evpmaxsb(dst, mask, nds, src, merge, vector_len); break; 9168 case T_SHORT: 9169 evpmaxsw(dst, mask, nds, src, merge, vector_len); break; 9170 case T_INT: 9171 evpmaxsd(dst, mask, nds, src, merge, vector_len); break; 9172 case T_LONG: 9173 evpmaxsq(dst, mask, nds, src, merge, vector_len); break; 9174 default: 9175 fatal("Unexpected type argument %s", type2name(type)); break; 9176 } 9177 } 9178 9179 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 9180 switch(type) { 9181 case T_INT: 9182 evpxord(dst, mask, nds, src, merge, vector_len); break; 9183 case T_LONG: 9184 evpxorq(dst, mask, nds, src, merge, vector_len); break; 9185 default: 9186 fatal("Unexpected type argument %s", type2name(type)); break; 9187 } 9188 } 9189 9190 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9191 switch(type) { 9192 case T_INT: 9193 evpxord(dst, mask, nds, src, merge, vector_len); break; 9194 case T_LONG: 9195 evpxorq(dst, mask, nds, src, merge, vector_len); break; 9196 default: 9197 fatal("Unexpected type argument %s", type2name(type)); break; 9198 } 9199 } 9200 9201 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 9202 switch(type) { 9203 case T_INT: 9204 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break; 9205 case T_LONG: 9206 evporq(dst, mask, nds, src, merge, vector_len); break; 9207 default: 9208 fatal("Unexpected type argument %s", type2name(type)); break; 9209 } 9210 } 9211 9212 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9213 switch(type) { 9214 case T_INT: 9215 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break; 9216 case T_LONG: 9217 evporq(dst, mask, nds, src, merge, vector_len); break; 9218 default: 9219 fatal("Unexpected type argument %s", type2name(type)); break; 9220 } 9221 } 9222 9223 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister 
src, bool merge, int vector_len) { 9224 switch(type) { 9225 case T_INT: 9226 evpandd(dst, mask, nds, src, merge, vector_len); break; 9227 case T_LONG: 9228 evpandq(dst, mask, nds, src, merge, vector_len); break; 9229 default: 9230 fatal("Unexpected type argument %s", type2name(type)); break; 9231 } 9232 } 9233 9234 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9235 switch(type) { 9236 case T_INT: 9237 evpandd(dst, mask, nds, src, merge, vector_len); break; 9238 case T_LONG: 9239 evpandq(dst, mask, nds, src, merge, vector_len); break; 9240 default: 9241 fatal("Unexpected type argument %s", type2name(type)); break; 9242 } 9243 } 9244 9245 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) { 9246 switch(masklen) { 9247 case 8: 9248 kortestbl(src1, src2); 9249 break; 9250 case 16: 9251 kortestwl(src1, src2); 9252 break; 9253 case 32: 9254 kortestdl(src1, src2); 9255 break; 9256 case 64: 9257 kortestql(src1, src2); 9258 break; 9259 default: 9260 fatal("Unexpected mask length %d", masklen); 9261 break; 9262 } 9263 } 9264 9265 9266 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) { 9267 switch(masklen) { 9268 case 8: 9269 ktestbl(src1, src2); 9270 break; 9271 case 16: 9272 ktestwl(src1, src2); 9273 break; 9274 case 32: 9275 ktestdl(src1, src2); 9276 break; 9277 case 64: 9278 ktestql(src1, src2); 9279 break; 9280 default: 9281 fatal("Unexpected mask length %d", masklen); 9282 break; 9283 } 9284 } 9285 9286 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) { 9287 switch(type) { 9288 case T_INT: 9289 evprold(dst, mask, src, shift, merge, vlen_enc); break; 9290 case T_LONG: 9291 evprolq(dst, mask, src, shift, merge, vlen_enc); break; 9292 default: 9293 fatal("Unexpected type argument %s", type2name(type)); break; 9294 break; 9295 } 9296 } 9297 9298 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) { 9299 switch(type) { 9300 case T_INT: 9301 evprord(dst, mask, src, shift, merge, vlen_enc); break; 9302 case T_LONG: 9303 evprorq(dst, mask, src, shift, merge, vlen_enc); break; 9304 default: 9305 fatal("Unexpected type argument %s", type2name(type)); break; 9306 } 9307 } 9308 9309 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 9310 switch(type) { 9311 case T_INT: 9312 evprolvd(dst, mask, src1, src2, merge, vlen_enc); break; 9313 case T_LONG: 9314 evprolvq(dst, mask, src1, src2, merge, vlen_enc); break; 9315 default: 9316 fatal("Unexpected type argument %s", type2name(type)); break; 9317 } 9318 } 9319 9320 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 9321 switch(type) { 9322 case T_INT: 9323 evprorvd(dst, mask, src1, src2, merge, vlen_enc); break; 9324 case T_LONG: 9325 evprorvq(dst, mask, src1, src2, merge, vlen_enc); break; 9326 default: 9327 fatal("Unexpected type argument %s", type2name(type)); break; 9328 } 9329 } 9330 9331 void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 9332 assert(rscratch != noreg || always_reachable(src), "missing"); 9333 9334 if (reachable(src)) { 9335 evpandq(dst, nds, as_Address(src), vector_len); 9336 } else { 9337 
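// The constant is not RIP-reachable here, so materialize its address in the
// scratch register and use an indirect operand instead.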
lea(rscratch, src); 9338 evpandq(dst, nds, Address(rscratch, 0), vector_len); 9339 } 9340 } 9341 9342 void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 9343 assert(rscratch != noreg || always_reachable(src), "missing"); 9344 9345 if (reachable(src)) { 9346 evporq(dst, nds, as_Address(src), vector_len); 9347 } else { 9348 lea(rscratch, src); 9349 evporq(dst, nds, Address(rscratch, 0), vector_len); 9350 } 9351 } 9352 9353 void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) { 9354 assert(rscratch != noreg || always_reachable(src3), "missing"); 9355 9356 if (reachable(src3)) { 9357 vpternlogq(dst, imm8, src2, as_Address(src3), vector_len); 9358 } else { 9359 lea(rscratch, src3); 9360 vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len); 9361 } 9362 } 9363 9364 #if COMPILER2_OR_JVMCI 9365 9366 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask, 9367 Register length, Register temp, int vec_enc) { 9368 // Computing mask for predicated vector store. 9369 movptr(temp, -1); 9370 bzhiq(temp, temp, length); 9371 kmov(mask, temp); 9372 evmovdqu(bt, mask, dst, xmm, true, vec_enc); 9373 } 9374 9375 // Set memory operation for length "less than" 64 bytes. 9376 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp, 9377 XMMRegister xmm, KRegister mask, Register length, 9378 Register temp, bool use64byteVector) { 9379 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 9380 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; 9381 if (!use64byteVector) { 9382 fill32(dst, disp, xmm); 9383 subptr(length, 32 >> shift); 9384 fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp); 9385 } else { 9386 assert(MaxVectorSize == 64, "vector length != 64"); 9387 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit); 9388 } 9389 } 9390 9391 9392 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp, 9393 XMMRegister xmm, KRegister mask, Register length, 9394 Register temp) { 9395 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 9396 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; 9397 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit); 9398 } 9399 9400 9401 void MacroAssembler::fill32(Address dst, XMMRegister xmm) { 9402 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 9403 vmovdqu(dst, xmm); 9404 } 9405 9406 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) { 9407 fill32(Address(dst, disp), xmm); 9408 } 9409 9410 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) { 9411 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 9412 if (!use64byteVector) { 9413 fill32(dst, xmm); 9414 fill32(dst.plus_disp(32), xmm); 9415 } else { 9416 evmovdquq(dst, xmm, Assembler::AVX_512bit); 9417 } 9418 } 9419 9420 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) { 9421 fill64(Address(dst, disp), xmm, use64byteVector); 9422 } 9423 9424 #ifdef _LP64 9425 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value, 9426 Register count, Register rtmp, XMMRegister xtmp) { 9427 Label L_exit; 9428 Label L_fill_start; 9429 Label L_fill_64_bytes; 9430 Label L_fill_96_bytes; 9431 Label L_fill_128_bytes; 9432 Label L_fill_128_bytes_loop; 9433 Label L_fill_128_loop_header; 
9434 Label L_fill_128_bytes_loop_header; 9435 Label L_fill_128_bytes_loop_pre_header; 9436 Label L_fill_zmm_sequence; 9437 9438 int shift = -1; 9439 int avx3threshold = VM_Version::avx3_threshold(); 9440 switch(type) { 9441 case T_BYTE: shift = 0; 9442 break; 9443 case T_SHORT: shift = 1; 9444 break; 9445 case T_INT: shift = 2; 9446 break; 9447 /* Uncomment when LONG fill stubs are supported. 9448 case T_LONG: shift = 3; 9449 break; 9450 */ 9451 default: 9452 fatal("Unhandled type: %s\n", type2name(type)); 9453 } 9454 9455 if ((avx3threshold != 0) || (MaxVectorSize == 32)) { 9456 9457 if (MaxVectorSize == 64) { 9458 cmpq(count, avx3threshold >> shift); 9459 jcc(Assembler::greater, L_fill_zmm_sequence); 9460 } 9461 9462 evpbroadcast(type, xtmp, value, Assembler::AVX_256bit); 9463 9464 bind(L_fill_start); 9465 9466 cmpq(count, 32 >> shift); 9467 jccb(Assembler::greater, L_fill_64_bytes); 9468 fill32_masked(shift, to, 0, xtmp, k2, count, rtmp); 9469 jmp(L_exit); 9470 9471 bind(L_fill_64_bytes); 9472 cmpq(count, 64 >> shift); 9473 jccb(Assembler::greater, L_fill_96_bytes); 9474 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp); 9475 jmp(L_exit); 9476 9477 bind(L_fill_96_bytes); 9478 cmpq(count, 96 >> shift); 9479 jccb(Assembler::greater, L_fill_128_bytes); 9480 fill64(to, 0, xtmp); 9481 subq(count, 64 >> shift); 9482 fill32_masked(shift, to, 64, xtmp, k2, count, rtmp); 9483 jmp(L_exit); 9484 9485 bind(L_fill_128_bytes); 9486 cmpq(count, 128 >> shift); 9487 jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header); 9488 fill64(to, 0, xtmp); 9489 fill32(to, 64, xtmp); 9490 subq(count, 96 >> shift); 9491 fill32_masked(shift, to, 96, xtmp, k2, count, rtmp); 9492 jmp(L_exit); 9493 9494 bind(L_fill_128_bytes_loop_pre_header); 9495 { 9496 mov(rtmp, to); 9497 andq(rtmp, 31); 9498 jccb(Assembler::zero, L_fill_128_bytes_loop_header); 9499 negq(rtmp); 9500 addq(rtmp, 32); 9501 mov64(r8, -1L); 9502 bzhiq(r8, r8, rtmp); 9503 kmovql(k2, r8); 9504 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit); 9505 addq(to, rtmp); 9506 shrq(rtmp, shift); 9507 subq(count, rtmp); 9508 } 9509 9510 cmpq(count, 128 >> shift); 9511 jcc(Assembler::less, L_fill_start); 9512 9513 bind(L_fill_128_bytes_loop_header); 9514 subq(count, 128 >> shift); 9515 9516 align32(); 9517 bind(L_fill_128_bytes_loop); 9518 fill64(to, 0, xtmp); 9519 fill64(to, 64, xtmp); 9520 addq(to, 128); 9521 subq(count, 128 >> shift); 9522 jccb(Assembler::greaterEqual, L_fill_128_bytes_loop); 9523 9524 addq(count, 128 >> shift); 9525 jcc(Assembler::zero, L_exit); 9526 jmp(L_fill_start); 9527 } 9528 9529 if (MaxVectorSize == 64) { 9530 // Sequence using 64 byte ZMM register. 
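// This path mirrors the 32-byte (YMM) sequence above but fills with 64-byte ZMM stores;
// it is taken when the fill size exceeds the AVX3 threshold, or unconditionally when the
// threshold is zero.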
9531 Label L_fill_128_bytes_zmm; 9532 Label L_fill_192_bytes_zmm; 9533 Label L_fill_192_bytes_loop_zmm; 9534 Label L_fill_192_bytes_loop_header_zmm; 9535 Label L_fill_192_bytes_loop_pre_header_zmm; 9536 Label L_fill_start_zmm_sequence; 9537 9538 bind(L_fill_zmm_sequence); 9539 evpbroadcast(type, xtmp, value, Assembler::AVX_512bit); 9540 9541 bind(L_fill_start_zmm_sequence); 9542 cmpq(count, 64 >> shift); 9543 jccb(Assembler::greater, L_fill_128_bytes_zmm); 9544 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true); 9545 jmp(L_exit); 9546 9547 bind(L_fill_128_bytes_zmm); 9548 cmpq(count, 128 >> shift); 9549 jccb(Assembler::greater, L_fill_192_bytes_zmm); 9550 fill64(to, 0, xtmp, true); 9551 subq(count, 64 >> shift); 9552 fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true); 9553 jmp(L_exit); 9554 9555 bind(L_fill_192_bytes_zmm); 9556 cmpq(count, 192 >> shift); 9557 jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm); 9558 fill64(to, 0, xtmp, true); 9559 fill64(to, 64, xtmp, true); 9560 subq(count, 128 >> shift); 9561 fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true); 9562 jmp(L_exit); 9563 9564 bind(L_fill_192_bytes_loop_pre_header_zmm); 9565 { 9566 movq(rtmp, to); 9567 andq(rtmp, 63); 9568 jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm); 9569 negq(rtmp); 9570 addq(rtmp, 64); 9571 mov64(r8, -1L); 9572 bzhiq(r8, r8, rtmp); 9573 kmovql(k2, r8); 9574 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit); 9575 addq(to, rtmp); 9576 shrq(rtmp, shift); 9577 subq(count, rtmp); 9578 } 9579 9580 cmpq(count, 192 >> shift); 9581 jcc(Assembler::less, L_fill_start_zmm_sequence); 9582 9583 bind(L_fill_192_bytes_loop_header_zmm); 9584 subq(count, 192 >> shift); 9585 9586 align32(); 9587 bind(L_fill_192_bytes_loop_zmm); 9588 fill64(to, 0, xtmp, true); 9589 fill64(to, 64, xtmp, true); 9590 fill64(to, 128, xtmp, true); 9591 addq(to, 192); 9592 subq(count, 192 >> shift); 9593 jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm); 9594 9595 addq(count, 192 >> shift); 9596 jcc(Assembler::zero, L_exit); 9597 jmp(L_fill_start_zmm_sequence); 9598 } 9599 bind(L_exit); 9600 } 9601 #endif 9602 #endif //COMPILER2_OR_JVMCI 9603 9604 9605 #ifdef _LP64 9606 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) { 9607 Label done; 9608 cvttss2sil(dst, src); 9609 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub 9610 cmpl(dst, 0x80000000); // float_sign_flip 9611 jccb(Assembler::notEqual, done); 9612 subptr(rsp, 8); 9613 movflt(Address(rsp, 0), src); 9614 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup()))); 9615 pop(dst); 9616 bind(done); 9617 } 9618 9619 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) { 9620 Label done; 9621 cvttsd2sil(dst, src); 9622 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub 9623 cmpl(dst, 0x80000000); // float_sign_flip 9624 jccb(Assembler::notEqual, done); 9625 subptr(rsp, 8); 9626 movdbl(Address(rsp, 0), src); 9627 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup()))); 9628 pop(dst); 9629 bind(done); 9630 } 9631 9632 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) { 9633 Label done; 9634 cvttss2siq(dst, src); 9635 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip())); 9636 jccb(Assembler::notEqual, done); 9637 subptr(rsp, 8); 9638 movflt(Address(rsp, 0), src); 9639 call(RuntimeAddress(CAST_FROM_FN_PTR(address, 
StubRoutines::x86::f2l_fixup()))); 9640 pop(dst); 9641 bind(done); 9642 } 9643 9644 void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) { 9645 // The following code is a line-by-line assembly translation of the rounding algorithm. 9646 // Please refer to the java.lang.Math.round(float) algorithm for details. 9647 const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000; 9648 const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24; 9649 const int32_t FloatConsts_EXP_BIAS = 127; 9650 const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF; 9651 const int32_t MINUS_32 = 0xFFFFFFE0; 9652 Label L_special_case, L_block1, L_exit; 9653 movl(rtmp, FloatConsts_EXP_BIT_MASK); 9654 movdl(dst, src); 9655 andl(dst, rtmp); 9656 sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1); 9657 movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS); 9658 subl(rtmp, dst); 9659 movl(rcx, rtmp); 9660 movl(dst, MINUS_32); 9661 testl(rtmp, dst); 9662 jccb(Assembler::notEqual, L_special_case); 9663 movdl(dst, src); 9664 andl(dst, FloatConsts_SIGNIF_BIT_MASK); 9665 orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1); 9666 movdl(rtmp, src); 9667 testl(rtmp, rtmp); 9668 jccb(Assembler::greaterEqual, L_block1); 9669 negl(dst); 9670 bind(L_block1); 9671 sarl(dst); 9672 addl(dst, 0x1); 9673 sarl(dst, 0x1); 9674 jmp(L_exit); 9675 bind(L_special_case); 9676 convert_f2i(dst, src); 9677 bind(L_exit); 9678 } 9679 9680 void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) { 9681 // The following code is a line-by-line assembly translation of the rounding algorithm. 9682 // Please refer to the java.lang.Math.round(double) algorithm for details. 9683 const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L; 9684 const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53; 9685 const int64_t DoubleConsts_EXP_BIAS = 1023; 9686 const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL; 9687 const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L; 9688 Label L_special_case, L_block1, L_exit; 9689 mov64(rtmp, DoubleConsts_EXP_BIT_MASK); 9690 movq(dst, src); 9691 andq(dst, rtmp); 9692 sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1); 9693 mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS); 9694 subq(rtmp, dst); 9695 movq(rcx, rtmp); 9696 mov64(dst, MINUS_64); 9697 testq(rtmp, dst); 9698 jccb(Assembler::notEqual, L_special_case); 9699 movq(dst, src); 9700 mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK); 9701 andq(dst, rtmp); 9702 mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1); 9703 orq(dst, rtmp); 9704 movq(rtmp, src); 9705 testq(rtmp, rtmp); 9706 jccb(Assembler::greaterEqual, L_block1); 9707 negq(dst); 9708 bind(L_block1); 9709 sarq(dst); 9710 addq(dst, 0x1); 9711 sarq(dst, 0x1); 9712 jmp(L_exit); 9713 bind(L_special_case); 9714 convert_d2l(dst, src); 9715 bind(L_exit); 9716 } 9717 9718 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) { 9719 Label done; 9720 cvttsd2siq(dst, src); 9721 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip())); 9722 jccb(Assembler::notEqual, done); 9723 subptr(rsp, 8); 9724 movdbl(Address(rsp, 0), src); 9725 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup()))); 9726 pop(dst); 9727 bind(done); 9728 } 9729 9730 void MacroAssembler::cache_wb(Address line) 9731 { 9732 // 64-bit CPUs always support clflush 9733 assert(VM_Version::supports_clflush(), "clflush should be available"); 9734 bool optimized = VM_Version::supports_clflushopt(); 9735 bool no_evict = VM_Version::supports_clwb(); 9736 9737 // prefer clwb (writeback
without evict) otherwise 9738 // prefer clflushopt (potentially parallel writeback with evict) 9739 // otherwise fallback on clflush (serial writeback with evict) 9740 9741 if (optimized) { 9742 if (no_evict) { 9743 clwb(line); 9744 } else { 9745 clflushopt(line); 9746 } 9747 } else { 9748 // no need for fence when using CLFLUSH 9749 clflush(line); 9750 } 9751 } 9752 9753 void MacroAssembler::cache_wbsync(bool is_pre) 9754 { 9755 assert(VM_Version::supports_clflush(), "clflush should be available"); 9756 bool optimized = VM_Version::supports_clflushopt(); 9757 bool no_evict = VM_Version::supports_clwb(); 9758 9759 // pick the correct implementation 9760 9761 if (!is_pre && (optimized || no_evict)) { 9762 // need an sfence for post flush when using clflushopt or clwb 9763 // otherwise no need for any synchronization 9764 9765 sfence(); 9766 } 9767 } 9768 9769 #endif // _LP64 9770 9771 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { 9772 switch (cond) { 9773 // Note some conditions are synonyms for others 9774 case Assembler::zero: return Assembler::notZero; 9775 case Assembler::notZero: return Assembler::zero; 9776 case Assembler::less: return Assembler::greaterEqual; 9777 case Assembler::lessEqual: return Assembler::greater; 9778 case Assembler::greater: return Assembler::lessEqual; 9779 case Assembler::greaterEqual: return Assembler::less; 9780 case Assembler::below: return Assembler::aboveEqual; 9781 case Assembler::belowEqual: return Assembler::above; 9782 case Assembler::above: return Assembler::belowEqual; 9783 case Assembler::aboveEqual: return Assembler::below; 9784 case Assembler::overflow: return Assembler::noOverflow; 9785 case Assembler::noOverflow: return Assembler::overflow; 9786 case Assembler::negative: return Assembler::positive; 9787 case Assembler::positive: return Assembler::negative; 9788 case Assembler::parity: return Assembler::noParity; 9789 case Assembler::noParity: return Assembler::parity; 9790 } 9791 ShouldNotReachHere(); return Assembler::overflow; 9792 } 9793 9794 SkipIfEqual::SkipIfEqual( 9795 MacroAssembler* masm, const bool* flag_addr, bool value, Register rscratch) { 9796 _masm = masm; 9797 _masm->cmp8(ExternalAddress((address)flag_addr), value, rscratch); 9798 _masm->jcc(Assembler::equal, _label); 9799 } 9800 9801 SkipIfEqual::~SkipIfEqual() { 9802 _masm->bind(_label); 9803 } 9804 9805 // 32-bit Windows has its own fast-path implementation 9806 // of get_thread 9807 #if !defined(WIN32) || defined(_LP64) 9808 9809 // This is simply a call to Thread::current() 9810 void MacroAssembler::get_thread(Register thread) { 9811 if (thread != rax) { 9812 push(rax); 9813 } 9814 LP64_ONLY(push(rdi);) 9815 LP64_ONLY(push(rsi);) 9816 push(rdx); 9817 push(rcx); 9818 #ifdef _LP64 9819 push(r8); 9820 push(r9); 9821 push(r10); 9822 push(r11); 9823 #endif 9824 9825 MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0); 9826 9827 #ifdef _LP64 9828 pop(r11); 9829 pop(r10); 9830 pop(r9); 9831 pop(r8); 9832 #endif 9833 pop(rcx); 9834 pop(rdx); 9835 LP64_ONLY(pop(rsi);) 9836 LP64_ONLY(pop(rdi);) 9837 if (thread != rax) { 9838 mov(thread, rax); 9839 pop(rax); 9840 } 9841 } 9842 9843 9844 #endif // !WIN32 || _LP64 9845 9846 void MacroAssembler::check_stack_alignment(Register sp, const char* msg, unsigned bias, Register tmp) { 9847 Label L_stack_ok; 9848 if (bias == 0) { 9849 testptr(sp, 2 * wordSize - 1); 9850 } else { 9851 // lea(tmp, Address(rsp, bias)); 9852 mov(tmp, sp); 9853 addptr(tmp, bias); 9854 testptr(tmp, 2 *
wordSize - 1); 9855 } 9856 jcc(Assembler::equal, L_stack_ok); 9857 block_comment(msg); 9858 stop(msg); 9859 bind(L_stack_ok); 9860 } 9861 9862 void MacroAssembler::fast_lock_impl(Register obj, Register hdr, Register thread, Register tmp, Label& slow, bool rt_check_stack) { 9863 assert(hdr == rax, "header must be in rax for cmpxchg"); 9864 assert_different_registers(obj, hdr, thread, tmp); 9865 9866 // First we need to check if the lock-stack has room for pushing the object reference. 9867 if (rt_check_stack) { 9868 movptr(tmp, Address(thread, JavaThread::lock_stack_current_offset())); 9869 cmpptr(tmp, Address(thread, JavaThread::lock_stack_limit_offset())); 9870 jcc(Assembler::greaterEqual, slow); 9871 } 9872 #ifdef ASSERT 9873 else { 9874 Label ok; 9875 movptr(tmp, Address(thread, JavaThread::lock_stack_current_offset())); 9876 cmpptr(tmp, Address(thread, JavaThread::lock_stack_limit_offset())); 9877 jcc(Assembler::less, ok); 9878 stop("Not enough room in lock stack; should have been checked in the method prologue"); 9879 bind(ok); 9880 } 9881 #endif 9882 9883 // Now we attempt to take the fast-lock. 9884 // Clear lowest two header bits (locked state). 9885 andptr(hdr, ~(int32_t)markWord::lock_mask_in_place); 9886 movptr(tmp, hdr); 9887 // Set lowest bit (unlocked state). 9888 orptr(hdr, markWord::unlocked_value); 9889 lock(); 9890 cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes())); 9891 jcc(Assembler::notEqual, slow); 9892 9893 // If successful, push object to lock-stack. 9894 movptr(tmp, Address(thread, JavaThread::lock_stack_current_offset())); 9895 movptr(Address(tmp, 0), obj); 9896 increment(tmp, oopSize); 9897 movptr(Address(thread, JavaThread::lock_stack_current_offset()), tmp); 9898 } 9899 9900 void MacroAssembler::fast_unlock_impl(Register obj, Register hdr, Register tmp, Label& slow) { 9901 assert(hdr == rax, "header must be in rax for cmpxchg"); 9902 assert_different_registers(obj, hdr, tmp); 9903 9904 // Mark-word must be 00 now, try to swing it back to 01 (unlocked) 9905 movptr(tmp, hdr); // The expected old value 9906 orptr(tmp, markWord::unlocked_value); 9907 lock(); 9908 cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes())); 9909 jcc(Assembler::notEqual, slow); 9910 // Pop the lock object from the lock-stack. 9911 #ifdef _LP64 9912 const Register thread = r15_thread; 9913 #else 9914 const Register thread = rax; 9915 get_thread(rax); 9916 #endif 9917 subptr(Address(thread, JavaThread::lock_stack_current_offset()), oopSize); 9918 }
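// Together, fast_lock_impl and fast_unlock_impl form the lock-stack fast path: locking CASes the
// mark word from unlocked (low bits 01) to locked (00) and pushes the oop onto the per-thread
// lock-stack, while unlocking swings the mark word back to 01 and pops the top lock-stack entry.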