1 /*
   2  * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "compiler/compiler_globals.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "crc32c.h"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/collectedHeap.inline.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/bytecodeHistogram.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "jvm.h"
  38 #include "memory/resourceArea.hpp"
  39 #include "memory/universe.hpp"
  40 #include "oops/accessDecorators.hpp"
  41 #include "oops/compressedOops.inline.hpp"
  42 #include "oops/klass.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "runtime/continuation.hpp"
  45 #include "runtime/interfaceSupport.inline.hpp"
  46 #include "runtime/javaThread.hpp"
  47 #include "runtime/jniHandles.hpp"
  48 #include "runtime/objectMonitor.hpp"
  49 #include "runtime/os.hpp"
  50 #include "runtime/safepoint.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "utilities/macros.hpp"
  55 
  56 #ifdef PRODUCT
  57 #define BLOCK_COMMENT(str) /* nothing */
  58 #define STOP(error) stop(error)
  59 #else
  60 #define BLOCK_COMMENT(str) block_comment(str)
  61 #define STOP(error) block_comment(error); stop(error)
  62 #endif
  63 
  64 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  65 
  66 #ifdef ASSERT
  67 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  68 #endif
  69 
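// Table of reversed (negated) condition codes, indexed by the Assembler::Condition
// encoding; see the per-entry comments for the original condition each slot negates.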
  70 static const Assembler::Condition reverse[] = {
  71     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  72     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  73     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  74     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  75     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  76     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  77     Assembler::above          /* belowEqual    = 0x6 */ ,
  78     Assembler::belowEqual     /* above         = 0x7 */ ,
  79     Assembler::positive       /* negative      = 0x8 */ ,
  80     Assembler::negative       /* positive      = 0x9 */ ,
  81     Assembler::noParity       /* parity        = 0xa */ ,
  82     Assembler::parity         /* noParity      = 0xb */ ,
  83     Assembler::greaterEqual   /* less          = 0xc */ ,
  84     Assembler::less           /* greaterEqual  = 0xd */ ,
  85     Assembler::greater        /* lessEqual     = 0xe */ ,
  86     Assembler::lessEqual      /* greater       = 0xf, */
  87 
  88 };
  89 
  90 
  91 // Implementation of MacroAssembler
  92 
  93 // First, all the versions that have distinct implementations for 32-bit and 64-bit,
  94 // unless the difference is trivial (a line or so).
  95 
  96 #ifndef _LP64
  97 
  98 // 32bit versions
  99 
 100 Address MacroAssembler::as_Address(AddressLiteral adr) {
 101   return Address(adr.target(), adr.rspec());
 102 }
 103 
 104 Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
 105   assert(rscratch == noreg, "");
 106   return Address::make_array(adr);
 107 }
 108 
 109 void MacroAssembler::call_VM_leaf_base(address entry_point,
 110                                        int number_of_arguments) {
 111   call(RuntimeAddress(entry_point));
 112   increment(rsp, number_of_arguments * wordSize);
 113 }
 114 
 115 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 116   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 117 }
 118 
 119 
 120 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 121   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 122 }
 123 
 124 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 125   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 126 }
 127 
 128 void MacroAssembler::cmpoop(Register src1, jobject obj, Register rscratch) {
 129   assert(rscratch == noreg, "redundant");
 130   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 131 }
 132 
 133 void MacroAssembler::extend_sign(Register hi, Register lo) {
 134   // According to Intel Doc. AP-526, "Integer Divide", p.18.
 135   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 136     cdql();
 137   } else {
 138     movl(hi, lo);
 139     sarl(hi, 31);
 140   }
 141 }
 142 
 143 void MacroAssembler::jC2(Register tmp, Label& L) {
 144   // set parity bit if FPU flag C2 is set (via rax)
 145   save_rax(tmp);
 146   fwait(); fnstsw_ax();
 147   sahf();
 148   restore_rax(tmp);
 149   // branch
 150   jcc(Assembler::parity, L);
 151 }
 152 
 153 void MacroAssembler::jnC2(Register tmp, Label& L) {
 154   // set parity bit if FPU flag C2 is set (via rax)
 155   save_rax(tmp);
 156   fwait(); fnstsw_ax();
 157   sahf();
 158   restore_rax(tmp);
 159   // branch
 160   jcc(Assembler::noParity, L);
 161 }
 162 
 163 // 32-bit can do a case table jump in one instruction, but we no longer allow the base
 164 // to be installed in the Address class.
 165 void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
 166   assert(rscratch == noreg, "not needed");
 167   jmp(as_Address(entry, noreg));
 168 }
 169 
 170 // Note: y_lo will be destroyed
 171 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 172   // Long compare for Java (semantics as described in JVM spec.)
 173   Label high, low, done;
 174 
 175   cmpl(x_hi, y_hi);
 176   jcc(Assembler::less, low);
 177   jcc(Assembler::greater, high);
 178   // x_hi is the return register
 179   xorl(x_hi, x_hi);
 180   cmpl(x_lo, y_lo);
 181   jcc(Assembler::below, low);
 182   jcc(Assembler::equal, done);
 183 
 184   bind(high);
 185   xorl(x_hi, x_hi);
 186   increment(x_hi);
 187   jmp(done);
 188 
 189   bind(low);
 190   xorl(x_hi, x_hi);
 191   decrementl(x_hi);
 192 
 193   bind(done);
 194 }
 195 
 196 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 197   mov_literal32(dst, (int32_t)src.target(), src.rspec());
 198 }
 199 
 200 void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
 201   assert(rscratch == noreg, "not needed");
 202 
 203   // leal(dst, as_Address(adr));
 204   // see note in movl as to why we must use a move
 205   mov_literal32(dst, (int32_t)adr.target(), adr.rspec());
 206 }
 207 
 208 void MacroAssembler::leave() {
 209   mov(rsp, rbp);
 210   pop(rbp);
 211 }
 212 
 213 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 214   // Multiplication of two Java long values stored on the stack
 215   // as illustrated below. Result is in rdx:rax.
 216   //
 217   // rsp ---> [  ??  ] \               \
 218   //            ....    | y_rsp_offset  |
 219   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 220   //          [ y_hi ]                  | (in bytes)
 221   //            ....                    |
 222   //          [ x_lo ]                 /
 223   //          [ x_hi ]
 224   //            ....
 225   //
 226   // Basic idea: lo(result) = lo(x_lo * y_lo)
 227   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
 228   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 229   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 230   Label quick;
 231   // load x_hi, y_hi and check if quick
 232   // multiplication is possible
 233   movl(rbx, x_hi);
 234   movl(rcx, y_hi);
 235   movl(rax, rbx);
 236   orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
 237   jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
 238   // do full multiplication
 239   // 1st step
 240   mull(y_lo);                                    // x_hi * y_lo
 241   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
 242   // 2nd step
 243   movl(rax, x_lo);
 244   mull(rcx);                                     // x_lo * y_hi
 245   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
 246   // 3rd step
 247   bind(quick);                                   // note: rbx, = 0 if quick multiply!
 248   movl(rax, x_lo);
 249   mull(y_lo);                                    // x_lo * y_lo
 250   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 251 }
 252 
 253 void MacroAssembler::lneg(Register hi, Register lo) {
 254   negl(lo);
 255   adcl(hi, 0);
 256   negl(hi);
 257 }
 258 
 259 void MacroAssembler::lshl(Register hi, Register lo) {
 260   // Java shift left long support (semantics as described in JVM spec., p.305)
 261   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 262   // shift value is in rcx !
 263   assert(hi != rcx, "must not use rcx");
 264   assert(lo != rcx, "must not use rcx");
 265   const Register s = rcx;                        // shift count
 266   const int      n = BitsPerWord;
 267   Label L;
 268   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 269   cmpl(s, n);                                    // if (s < n)
 270   jcc(Assembler::less, L);                       // else (s >= n)
 271   movl(hi, lo);                                  // x := x << n
 272   xorl(lo, lo);
 273   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 274   bind(L);                                       // s (mod n) < n
 275   shldl(hi, lo);                                 // x := x << s
 276   shll(lo);
 277 }
 278 
 279 
 280 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 281   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 282   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
 283   assert(hi != rcx, "must not use rcx");
 284   assert(lo != rcx, "must not use rcx");
 285   const Register s = rcx;                        // shift count
 286   const int      n = BitsPerWord;
 287   Label L;
 288   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 289   cmpl(s, n);                                    // if (s < n)
 290   jcc(Assembler::less, L);                       // else (s >= n)
 291   movl(lo, hi);                                  // x := x >> n
 292   if (sign_extension) sarl(hi, 31);
 293   else                xorl(hi, hi);
 294   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 295   bind(L);                                       // s (mod n) < n
 296   shrdl(lo, hi);                                 // x := x >> s
 297   if (sign_extension) sarl(hi);
 298   else                shrl(hi);
 299 }
 300 
 301 void MacroAssembler::movoop(Register dst, jobject obj) {
 302   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 303 }
 304 
 305 void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
 306   assert(rscratch == noreg, "redundant");
 307   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 308 }
 309 
 310 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 311   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 312 }
 313 
 314 void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
 315   assert(rscratch == noreg, "redundant");
 316   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 317 }
 318 
 319 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
 320   if (src.is_lval()) {
 321     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 322   } else {
 323     movl(dst, as_Address(src));
 324   }
 325 }
 326 
 327 void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
 328   assert(rscratch == noreg, "redundant");
 329   movl(as_Address(dst, noreg), src);
 330 }
 331 
 332 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 333   movl(dst, as_Address(src, noreg));
 334 }
 335 
 336 void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
 337   assert(rscratch == noreg, "redundant");
 338   movl(dst, src);
 339 }
 340 
 341 void MacroAssembler::pushoop(jobject obj, Register rscratch) {
 342   assert(rscratch == noreg, "redundant");
 343   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 344 }
 345 
 346 void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
 347   assert(rscratch == noreg, "redundant");
 348   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 349 }
 350 
 351 void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
 352   assert(rscratch == noreg, "redundant");
 353   if (src.is_lval()) {
 354     push_literal32((int32_t)src.target(), src.rspec());
 355   } else {
 356     pushl(as_Address(src));
 357   }
 358 }
 359 
 360 static void pass_arg0(MacroAssembler* masm, Register arg) {
 361   masm->push(arg);
 362 }
 363 
 364 static void pass_arg1(MacroAssembler* masm, Register arg) {
 365   masm->push(arg);
 366 }
 367 
 368 static void pass_arg2(MacroAssembler* masm, Register arg) {
 369   masm->push(arg);
 370 }
 371 
 372 static void pass_arg3(MacroAssembler* masm, Register arg) {
 373   masm->push(arg);
 374 }
 375 
 376 #ifndef PRODUCT
 377 extern "C" void findpc(intptr_t x);
 378 #endif
 379 
 380 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 381   // In order to get locks to work, we need to fake an in_VM state
 382   JavaThread* thread = JavaThread::current();
 383   JavaThreadState saved_state = thread->thread_state();
 384   thread->set_thread_state(_thread_in_vm);
 385   if (ShowMessageBoxOnError) {
 386     JavaThread* thread = JavaThread::current();
 387     JavaThreadState saved_state = thread->thread_state();
 388     thread->set_thread_state(_thread_in_vm);
 389     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 390       ttyLocker ttyl;
 391       BytecodeCounter::print();
 392     }
 393     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 394     // This is the value of eip which points to where verify_oop will return.
 395     if (os::message_box(msg, "Execution stopped, print registers?")) {
 396       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 397       BREAKPOINT;
 398     }
 399   }
 400   fatal("DEBUG MESSAGE: %s", msg);
 401 }
 402 
 403 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 404   ttyLocker ttyl;
 405   DebuggingContext debugging{};
 406   tty->print_cr("eip = 0x%08x", eip);
 407 #ifndef PRODUCT
 408   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 409     tty->cr();
 410     findpc(eip);
 411     tty->cr();
 412   }
 413 #endif
 414 #define PRINT_REG(rax) \
 415   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 416   PRINT_REG(rax);
 417   PRINT_REG(rbx);
 418   PRINT_REG(rcx);
 419   PRINT_REG(rdx);
 420   PRINT_REG(rdi);
 421   PRINT_REG(rsi);
 422   PRINT_REG(rbp);
 423   PRINT_REG(rsp);
 424 #undef PRINT_REG
 425   // Print some words near the top of the stack.
 426   int* dump_sp = (int*) rsp;
 427   for (int col1 = 0; col1 < 8; col1++) {
 428     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 429     os::print_location(tty, *dump_sp++);
 430   }
 431   for (int row = 0; row < 16; row++) {
 432     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 433     for (int col = 0; col < 8; col++) {
 434       tty->print(" 0x%08x", *dump_sp++);
 435     }
 436     tty->cr();
 437   }
 438   // Print some instructions around pc:
 439   Disassembler::decode((address)eip-64, (address)eip);
 440   tty->print_cr("--------");
 441   Disassembler::decode((address)eip, (address)eip+32);
 442 }
 443 
 444 void MacroAssembler::stop(const char* msg) {
 445   // push address of message
 446   ExternalAddress message((address)msg);
 447   pushptr(message.addr(), noreg);
 448   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 449   pusha();                                            // push registers
 450   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 451   hlt();
 452 }
 453 
 454 void MacroAssembler::warn(const char* msg) {
 455   push_CPU_state();
 456 
 457   // push address of message
 458   ExternalAddress message((address)msg);
 459   pushptr(message.addr(), noreg);
 460 
 461   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 462   addl(rsp, wordSize);       // discard argument
 463   pop_CPU_state();
 464 }
 465 
 466 void MacroAssembler::print_state() {
 467   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 468   pusha();                                            // push registers
 469 
 470   push_CPU_state();
 471   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 472   pop_CPU_state();
 473 
 474   popa();
 475   addl(rsp, wordSize);
 476 }
 477 
 478 #else // _LP64
 479 
 480 // 64 bit versions
 481 
 482 Address MacroAssembler::as_Address(AddressLiteral adr) {
 483   // amd64 always does this as a pc-relative address;
 484   // we can be absolute or displacement-based depending on the instruction type:
 485   // jmp/call use displacements, others are absolute
 486   assert(!adr.is_lval(), "must be rval");
 487   assert(reachable(adr), "must be");
 488   return Address(checked_cast<int32_t>(adr.target() - pc()), adr.target(), adr.reloc());
 489 
 490 }
 491 
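// Build a usable Address from an ArrayAddress by materializing the base address
// in rscratch and combining it with the index/scale of the ArrayAddress.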
 492 Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
 493   AddressLiteral base = adr.base();
 494   lea(rscratch, base);
 495   Address index = adr.index();
 496   assert(index._disp == 0, "must not have disp"); // maybe it can?
 497   Address array(rscratch, index._index, index._scale, index._disp);
 498   return array;
 499 }
 500 
 501 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 502   Label L, E;
 503 
 504 #ifdef _WIN64
 505   // Windows always allocates space for its register args
 506   assert(num_args <= 4, "only register arguments supported");
 507   subq(rsp,  frame::arg_reg_save_area_bytes);
 508 #endif
 509 
 510   // Align stack if necessary
 511   testl(rsp, 15);
 512   jcc(Assembler::zero, L);
 513 
 514   subq(rsp, 8);
 515   call(RuntimeAddress(entry_point));
 516   addq(rsp, 8);
 517   jmp(E);
 518 
 519   bind(L);
 520   call(RuntimeAddress(entry_point));
 521 
 522   bind(E);
 523 
 524 #ifdef _WIN64
 525   // restore stack pointer
 526   addq(rsp, frame::arg_reg_save_area_bytes);
 527 #endif
 528 
 529 }
 530 
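// 64-bit compare against a value in memory; if the literal address is not
// RIP-reachable, it is first materialized in rscratch.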
 531 void MacroAssembler::cmp64(Register src1, AddressLiteral src2, Register rscratch) {
 532   assert(!src2.is_lval(), "should use cmpptr");
 533   assert(rscratch != noreg || always_reachable(src2), "missing");
 534 
 535   if (reachable(src2)) {
 536     cmpq(src1, as_Address(src2));
 537   } else {
 538     lea(rscratch, src2);
 539     Assembler::cmpq(src1, Address(rscratch, 0));
 540   }
 541 }
 542 
 543 int MacroAssembler::corrected_idivq(Register reg) {
 544   // Full implementation of Java ldiv and lrem; checks for special
 545   // case as described in JVM spec., p.243 & p.271.  The function
 546   // returns the (pc) offset of the idivq instruction - may be needed
 547   // for implicit exceptions.
 548   //
 549   //         normal case                           special case
 550   //
 551   // input : rax: dividend                         min_long
 552   //         reg: divisor   (may not be eax/edx)   -1
 553   //
 554   // output: rax: quotient  (= rax idiv reg)       min_long
 555   //         rdx: remainder (= rax irem reg)       0
 556   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 557   static const int64_t min_long = 0x8000000000000000;
 558   Label normal_case, special_case;
 559 
 560   // check for special case
 561   cmp64(rax, ExternalAddress((address) &min_long), rdx /*rscratch*/);
 562   jcc(Assembler::notEqual, normal_case);
 563   xorl(rdx, rdx); // prepare rdx for possible special case (where
 564                   // remainder = 0)
 565   cmpq(reg, -1);
 566   jcc(Assembler::equal, special_case);
 567 
 568   // handle normal case
 569   bind(normal_case);
 570   cdqq();
 571   int idivq_offset = offset();
 572   idivq(reg);
 573 
 574   // normal and special case exit
 575   bind(special_case);
 576 
 577   return idivq_offset;
 578 }
 579 
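// Increment/decrement by a small constant. Negative values are delegated to the
// opposite operation, a count of 1 uses inc/dec when UseIncDec is enabled, and
// min_jint is handled directly since it cannot be negated.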
 580 void MacroAssembler::decrementq(Register reg, int value) {
 581   if (value == min_jint) { subq(reg, value); return; }
 582   if (value <  0) { incrementq(reg, -value); return; }
 583   if (value == 0) {                        ; return; }
 584   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 585   /* else */      { subq(reg, value)       ; return; }
 586 }
 587 
 588 void MacroAssembler::decrementq(Address dst, int value) {
 589   if (value == min_jint) { subq(dst, value); return; }
 590   if (value <  0) { incrementq(dst, -value); return; }
 591   if (value == 0) {                        ; return; }
 592   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 593   /* else */      { subq(dst, value)       ; return; }
 594 }
 595 
 596 void MacroAssembler::incrementq(AddressLiteral dst, Register rscratch) {
 597   assert(rscratch != noreg || always_reachable(dst), "missing");
 598 
 599   if (reachable(dst)) {
 600     incrementq(as_Address(dst));
 601   } else {
 602     lea(rscratch, dst);
 603     incrementq(Address(rscratch, 0));
 604   }
 605 }
 606 
 607 void MacroAssembler::incrementq(Register reg, int value) {
 608   if (value == min_jint) { addq(reg, value); return; }
 609   if (value <  0) { decrementq(reg, -value); return; }
 610   if (value == 0) {                        ; return; }
 611   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 612   /* else */      { addq(reg, value)       ; return; }
 613 }
 614 
 615 void MacroAssembler::incrementq(Address dst, int value) {
 616   if (value == min_jint) { addq(dst, value); return; }
 617   if (value <  0) { decrementq(dst, -value); return; }
 618   if (value == 0) {                        ; return; }
 619   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 620   /* else */      { addq(dst, value)       ; return; }
 621 }
 622 
 623 // 32-bit can do a case table jump in one instruction, but we no longer allow the base
 624 // to be installed in the Address class.
 625 void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
 626   lea(rscratch, entry.base());
 627   Address dispatch = entry.index();
 628   assert(dispatch._base == noreg, "must be");
 629   dispatch._base = rscratch;
 630   jmp(dispatch);
 631 }
 632 
 633 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 634   ShouldNotReachHere(); // 64bit doesn't use two regs
 635   cmpq(x_lo, y_lo);
 636 }
 637 
 638 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 639   mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 640 }
 641 
 642 void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
 643   lea(rscratch, adr);
 644   movptr(dst, rscratch);
 645 }
 646 
 647 void MacroAssembler::leave() {
 648   // %%% is this really better? Why not on 32bit too?
 649   emit_int8((unsigned char)0xC9); // LEAVE
 650 }
 651 
 652 void MacroAssembler::lneg(Register hi, Register lo) {
 653   ShouldNotReachHere(); // 64bit doesn't use two regs
 654   negq(lo);
 655 }
 656 
 657 void MacroAssembler::movoop(Register dst, jobject obj) {
 658   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 659 }
 660 
 661 void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
 662   mov_literal64(rscratch, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 663   movq(dst, rscratch);
 664 }
 665 
 666 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 667   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 668 }
 669 
 670 void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
 671   mov_literal64(rscratch, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 672   movq(dst, rscratch);
 673 }
 674 
 675 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
 676   if (src.is_lval()) {
 677     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 678   } else {
 679     if (reachable(src)) {
 680       movq(dst, as_Address(src));
 681     } else {
 682       lea(dst, src);
 683       movq(dst, Address(dst, 0));
 684     }
 685   }
 686 }
 687 
 688 void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
 689   movq(as_Address(dst, rscratch), src);
 690 }
 691 
 692 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 693   movq(dst, as_Address(src, dst /*rscratch*/));
 694 }
 695 
 696 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 697 void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
 698   if (is_simm32(src)) {
 699     movptr(dst, checked_cast<int32_t>(src));
 700   } else {
 701     mov64(rscratch, src);
 702     movq(dst, rscratch);
 703   }
 704 }
 705 
 706 void MacroAssembler::pushoop(jobject obj, Register rscratch) {
 707   movoop(rscratch, obj);
 708   push(rscratch);
 709 }
 710 
 711 void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
 712   mov_metadata(rscratch, obj);
 713   push(rscratch);
 714 }
 715 
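// Push either the literal address itself (lval) or the word it points to.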
 716 void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
 717   lea(rscratch, src);
 718   if (src.is_lval()) {
 719     push(rscratch);
 720   } else {
 721     pushq(Address(rscratch, 0));
 722   }
 723 }
 724 
 725 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 726   reset_last_Java_frame(r15_thread, clear_fp);
 727 }
 728 
 729 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 730                                          Register last_java_fp,
 731                                          address  last_java_pc,
 732                                          Register rscratch) {
 733   set_last_Java_frame(r15_thread, last_java_sp, last_java_fp, last_java_pc, rscratch);
 734 }
 735 
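// Move an outgoing argument into its C calling-convention register, unless it is
// already there.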
 736 static void pass_arg0(MacroAssembler* masm, Register arg) {
 737   if (c_rarg0 != arg ) {
 738     masm->mov(c_rarg0, arg);
 739   }
 740 }
 741 
 742 static void pass_arg1(MacroAssembler* masm, Register arg) {
 743   if (c_rarg1 != arg ) {
 744     masm->mov(c_rarg1, arg);
 745   }
 746 }
 747 
 748 static void pass_arg2(MacroAssembler* masm, Register arg) {
 749   if (c_rarg2 != arg ) {
 750     masm->mov(c_rarg2, arg);
 751   }
 752 }
 753 
 754 static void pass_arg3(MacroAssembler* masm, Register arg) {
 755   if (c_rarg3 != arg ) {
 756     masm->mov(c_rarg3, arg);
 757   }
 758 }
 759 
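// Emit a fatal stop: under ShowMessageBoxOnError the registers and the stop pc are
// captured first so debug64 can display them; then call debug64 and halt.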
 760 void MacroAssembler::stop(const char* msg) {
 761   if (ShowMessageBoxOnError) {
 762     address rip = pc();
 763     pusha(); // get regs on stack
 764     lea(c_rarg1, InternalAddress(rip));
 765     movq(c_rarg2, rsp); // pass pointer to regs array
 766   }
 767   lea(c_rarg0, ExternalAddress((address) msg));
 768   andq(rsp, -16); // align stack as required by ABI
 769   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 770   hlt();
 771 }
 772 
 773 void MacroAssembler::warn(const char* msg) {
 774   push(rbp);
 775   movq(rbp, rsp);
 776   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 777   push_CPU_state();   // keeps alignment at 16 bytes
 778 
 779   lea(c_rarg0, ExternalAddress((address) msg));
 780   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 781 
 782   pop_CPU_state();
 783   mov(rsp, rbp);
 784   pop(rbp);
 785 }
 786 
 787 void MacroAssembler::print_state() {
 788   address rip = pc();
 789   pusha();            // get regs on stack
 790   push(rbp);
 791   movq(rbp, rsp);
 792   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 793   push_CPU_state();   // keeps alignment at 16 bytes
 794 
 795   lea(c_rarg0, InternalAddress(rip));
 796   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 797   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 798 
 799   pop_CPU_state();
 800   mov(rsp, rbp);
 801   pop(rbp);
 802   popa();
 803 }
 804 
 805 #ifndef PRODUCT
 806 extern "C" void findpc(intptr_t x);
 807 #endif
 808 
 809 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 810   // In order to get locks to work, we need to fake an in_VM state
 811   if (ShowMessageBoxOnError) {
 812     JavaThread* thread = JavaThread::current();
 813     JavaThreadState saved_state = thread->thread_state();
 814     thread->set_thread_state(_thread_in_vm);
 815 #ifndef PRODUCT
 816     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 817       ttyLocker ttyl;
 818       BytecodeCounter::print();
 819     }
 820 #endif
 821     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 822     // XXX correct this offset for amd64
 823     // This is the value of eip which points to where verify_oop will return.
 824     if (os::message_box(msg, "Execution stopped, print registers?")) {
 825       print_state64(pc, regs);
 826       BREAKPOINT;
 827     }
 828   }
 829   fatal("DEBUG MESSAGE: %s", msg);
 830 }
 831 
 832 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 833   ttyLocker ttyl;
 834   DebuggingContext debugging{};
 835   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 836 #ifndef PRODUCT
 837   tty->cr();
 838   findpc(pc);
 839   tty->cr();
 840 #endif
 841 #define PRINT_REG(rax, value) \
 842   { tty->print("%s = ", #rax); os::print_location(tty, value); }
 843   PRINT_REG(rax, regs[15]);
 844   PRINT_REG(rbx, regs[12]);
 845   PRINT_REG(rcx, regs[14]);
 846   PRINT_REG(rdx, regs[13]);
 847   PRINT_REG(rdi, regs[8]);
 848   PRINT_REG(rsi, regs[9]);
 849   PRINT_REG(rbp, regs[10]);
 850   // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
 851   PRINT_REG(rsp, (intptr_t)(&regs[16]));
 852   PRINT_REG(r8 , regs[7]);
 853   PRINT_REG(r9 , regs[6]);
 854   PRINT_REG(r10, regs[5]);
 855   PRINT_REG(r11, regs[4]);
 856   PRINT_REG(r12, regs[3]);
 857   PRINT_REG(r13, regs[2]);
 858   PRINT_REG(r14, regs[1]);
 859   PRINT_REG(r15, regs[0]);
 860 #undef PRINT_REG
 861   // Print some words near the top of the stack.
 862   int64_t* rsp = &regs[16];
 863   int64_t* dump_sp = rsp;
 864   for (int col1 = 0; col1 < 8; col1++) {
 865     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 866     os::print_location(tty, *dump_sp++);
 867   }
 868   for (int row = 0; row < 25; row++) {
 869     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 870     for (int col = 0; col < 4; col++) {
 871       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 872     }
 873     tty->cr();
 874   }
 875   // Print some instructions around pc:
 876   Disassembler::decode((address)pc-64, (address)pc);
 877   tty->print_cr("--------");
 878   Disassembler::decode((address)pc, (address)pc+32);
 879 }
 880 
 881 // The java_calling_convention describes stack locations as ideal slots on
 882 // a frame with no abi restrictions. Since we must observe abi restrictions
 883 // (like the placement of the register window) the slots must be biased by
 884 // the following value.
 885 static int reg2offset_in(VMReg r) {
 886   // Account for saved rbp and return address
 887   // This should really be in_preserve_stack_slots
 888   return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
 889 }
 890 
 891 static int reg2offset_out(VMReg r) {
 892   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
 893 }
 894 
 895 // A long move
 896 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 897 
 898   // The calling convention assures us that each VMRegPair is either
 899   // all really one physical register or adjacent stack slots.
 900 
 901   if (src.is_single_phys_reg() ) {
 902     if (dst.is_single_phys_reg()) {
 903       if (dst.first() != src.first()) {
 904         mov(dst.first()->as_Register(), src.first()->as_Register());
 905       }
 906     } else {
 907       assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
 908              src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
 909       movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
 910     }
 911   } else if (dst.is_single_phys_reg()) {
 912     assert(src.is_single_reg(),  "not a stack pair");
 913     movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 914   } else {
 915     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 916     movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 917     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 918   }
 919 }
 920 
 921 // A double move
 922 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 923 
 924   // The calling convention assures us that each VMRegPair is either
 925   // all really one physical register or adjacent stack slots.
 926 
 927   if (src.is_single_phys_reg() ) {
 928     if (dst.is_single_phys_reg()) {
 929       // In theory these overlap but the ordering is such that this is likely a nop
 930       if ( src.first() != dst.first()) {
 931         movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
 932       }
 933     } else {
 934       assert(dst.is_single_reg(), "not a stack pair");
 935       movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
 936     }
 937   } else if (dst.is_single_phys_reg()) {
 938     assert(src.is_single_reg(),  "not a stack pair");
 939     movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 940   } else {
 941     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 942     movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 943     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 944   }
 945 }
 946 
 947 
 948 // A float arg may have to do float reg int reg conversion
 949 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 950   assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
 951 
 952   // The calling convention assures us that each VMRegPair is either
 953   // all really one physical register or adjacent stack slots.
 954 
 955   if (src.first()->is_stack()) {
 956     if (dst.first()->is_stack()) {
 957       movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 958       movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 959     } else {
 960       // stack to reg
 961       assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
 962       movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 963     }
 964   } else if (dst.first()->is_stack()) {
 965     // reg to stack
 966     assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
 967     movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
 968   } else {
 969     // reg to reg
 970     // In theory these overlap but the ordering is such that this is likely a nop
 971     if ( src.first() != dst.first()) {
 972       movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
 973     }
 974   }
 975 }
 976 
 977 // On 64-bit we store integer-like items to the stack as
 978 // 64-bit items (x86_32/64 ABI) even though Java would only store
 979 // 32 bits for a parameter. On 32-bit it will simply be 32 bits.
 980 // So this routine does 32->32 on 32-bit and 32->64 on 64-bit.
 981 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 982   if (src.first()->is_stack()) {
 983     if (dst.first()->is_stack()) {
 984       // stack to stack
 985       movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 986       movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 987     } else {
 988       // stack to reg
 989       movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 990     }
 991   } else if (dst.first()->is_stack()) {
 992     // reg to stack
 993     // Do we really have to sign extend???
 994     // __ movslq(src.first()->as_Register(), src.first()->as_Register());
 995     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
 996   } else {
 997     // Do we really have to sign extend???
 998     // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
 999     if (dst.first() != src.first()) {
1000       movq(dst.first()->as_Register(), src.first()->as_Register());
1001     }
1002   }
1003 }
1004 
1005 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
1006   if (src.first()->is_stack()) {
1007     if (dst.first()->is_stack()) {
1008       // stack to stack
1009       movq(rax, Address(rbp, reg2offset_in(src.first())));
1010       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1011     } else {
1012       // stack to reg
1013       movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1014     }
1015   } else if (dst.first()->is_stack()) {
1016     // reg to stack
1017     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1018   } else {
1019     if (dst.first() != src.first()) {
1020       movq(dst.first()->as_Register(), src.first()->as_Register());
1021     }
1022   }
1023 }
1024 
1025 // An oop arg. Must pass a handle not the oop itself
1026 void MacroAssembler::object_move(OopMap* map,
1027                         int oop_handle_offset,
1028                         int framesize_in_slots,
1029                         VMRegPair src,
1030                         VMRegPair dst,
1031                         bool is_receiver,
1032                         int* receiver_offset) {
1033 
1034   // must pass a handle. First figure out the location we use as a handle
1035 
1036   Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1037 
1038   // See if the oop is null; if it is, we need no handle
1039 
1040   if (src.first()->is_stack()) {
1041 
1042     // Oop is already on the stack as an argument
1043     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1044     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1045     if (is_receiver) {
1046       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1047     }
1048 
1049     cmpptr(Address(rbp, reg2offset_in(src.first())), NULL_WORD);
1050     lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1051     // conditionally move a null
1052     cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1053   } else {
1054 
1055     // Oop is in a register; we must store it to the space we reserve
1056     // on the stack for oop_handles, and pass a handle if the oop is non-null
1057 
1058     const Register rOop = src.first()->as_Register();
1059     int oop_slot;
1060     if (rOop == j_rarg0)
1061       oop_slot = 0;
1062     else if (rOop == j_rarg1)
1063       oop_slot = 1;
1064     else if (rOop == j_rarg2)
1065       oop_slot = 2;
1066     else if (rOop == j_rarg3)
1067       oop_slot = 3;
1068     else if (rOop == j_rarg4)
1069       oop_slot = 4;
1070     else {
1071       assert(rOop == j_rarg5, "wrong register");
1072       oop_slot = 5;
1073     }
1074 
1075     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1076     int offset = oop_slot*VMRegImpl::stack_slot_size;
1077 
1078     map->set_oop(VMRegImpl::stack2reg(oop_slot));
1079     // Store oop in handle area, may be null
1080     movptr(Address(rsp, offset), rOop);
1081     if (is_receiver) {
1082       *receiver_offset = offset;
1083     }
1084 
1085     cmpptr(rOop, NULL_WORD);
1086     lea(rHandle, Address(rsp, offset));
1087     // conditionally move a null from the handle area where it was just stored
1088     cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1089   }
1090 
1091   // If arg is on the stack then place it, otherwise it is already in the correct reg.
1092   if (dst.first()->is_stack()) {
1093     movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1094   }
1095 }
1096 
1097 #endif // _LP64
1098 
1099 // Now versions that are common to 32/64 bit
1100 
1101 void MacroAssembler::addptr(Register dst, int32_t imm32) {
1102   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
1103 }
1104 
1105 void MacroAssembler::addptr(Register dst, Register src) {
1106   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1107 }
1108 
1109 void MacroAssembler::addptr(Address dst, Register src) {
1110   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1111 }
1112 
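// AddressLiteral variants of the arithmetic/logic helpers below follow a common
// pattern: use the operand directly when it is reachable (RIP-relative on 64-bit),
// otherwise materialize the address in rscratch and use a register-indirect operand.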
1113 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1114   assert(rscratch != noreg || always_reachable(src), "missing");
1115 
1116   if (reachable(src)) {
1117     Assembler::addsd(dst, as_Address(src));
1118   } else {
1119     lea(rscratch, src);
1120     Assembler::addsd(dst, Address(rscratch, 0));
1121   }
1122 }
1123 
1124 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1125   assert(rscratch != noreg || always_reachable(src), "missing");
1126 
1127   if (reachable(src)) {
1128     addss(dst, as_Address(src));
1129   } else {
1130     lea(rscratch, src);
1131     addss(dst, Address(rscratch, 0));
1132   }
1133 }
1134 
1135 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1136   assert(rscratch != noreg || always_reachable(src), "missing");
1137 
1138   if (reachable(src)) {
1139     Assembler::addpd(dst, as_Address(src));
1140   } else {
1141     lea(rscratch, src);
1142     Assembler::addpd(dst, Address(rscratch, 0));
1143   }
1144 }
1145 
1146 // See 8273459.  Function for ensuring 64-byte alignment, intended for stubs only.
1147 // Stub code is generated once and never copied.
1148 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
1149 void MacroAssembler::align64() {
1150   align(64, (unsigned long long) pc());
1151 }
1152 
1153 void MacroAssembler::align32() {
1154   align(32, (unsigned long long) pc());
1155 }
1156 
1157 void MacroAssembler::align(int modulus) {
1158   // 8273459: Ensure alignment is possible with current segment alignment
1159   assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
1160   align(modulus, offset());
1161 }
1162 
1163 void MacroAssembler::align(int modulus, int target) {
1164   if (target % modulus != 0) {
1165     nop(modulus - (target % modulus));
1166   }
1167 }
1168 
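// Push/pop float and double values via the stack using SSE moves.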
1169 void MacroAssembler::push_f(XMMRegister r) {
1170   subptr(rsp, wordSize);
1171   movflt(Address(rsp, 0), r);
1172 }
1173 
1174 void MacroAssembler::pop_f(XMMRegister r) {
1175   movflt(r, Address(rsp, 0));
1176   addptr(rsp, wordSize);
1177 }
1178 
1179 void MacroAssembler::push_d(XMMRegister r) {
1180   subptr(rsp, 2 * wordSize);
1181   movdbl(Address(rsp, 0), r);
1182 }
1183 
1184 void MacroAssembler::pop_d(XMMRegister r) {
1185   movdbl(r, Address(rsp, 0));
1186   addptr(rsp, 2 * Interpreter::stackElementSize);
1187 }
1188 
1189 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1190   // Used in sign-masking with aligned address.
1191   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1192   assert(rscratch != noreg || always_reachable(src), "missing");
1193 
1194   if (reachable(src)) {
1195     Assembler::andpd(dst, as_Address(src));
1196   } else {
1197     lea(rscratch, src);
1198     Assembler::andpd(dst, Address(rscratch, 0));
1199   }
1200 }
1201 
1202 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register rscratch) {
1203   // Used in sign-masking with aligned address.
1204   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1205   assert(rscratch != noreg || always_reachable(src), "missing");
1206 
1207   if (reachable(src)) {
1208     Assembler::andps(dst, as_Address(src));
1209   } else {
1210     lea(rscratch, src);
1211     Assembler::andps(dst, Address(rscratch, 0));
1212   }
1213 }
1214 
1215 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1216   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1217 }
1218 
1219 #ifdef _LP64
1220 void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) {
1221   assert(rscratch != noreg || always_reachable(src), "missing");
1222 
1223   if (reachable(src)) {
1224     andq(dst, as_Address(src));
1225   } else {
1226     lea(rscratch, src);
1227     andq(dst, Address(rscratch, 0));
1228   }
1229 }
1230 #endif
1231 
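// Atomically increment a counter in memory using a locked increment.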
1232 void MacroAssembler::atomic_incl(Address counter_addr) {
1233   lock();
1234   incrementl(counter_addr);
1235 }
1236 
1237 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register rscratch) {
1238   assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1239 
1240   if (reachable(counter_addr)) {
1241     atomic_incl(as_Address(counter_addr));
1242   } else {
1243     lea(rscratch, counter_addr);
1244     atomic_incl(Address(rscratch, 0));
1245   }
1246 }
1247 
1248 #ifdef _LP64
1249 void MacroAssembler::atomic_incq(Address counter_addr) {
1250   lock();
1251   incrementq(counter_addr);
1252 }
1253 
1254 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register rscratch) {
1255   assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1256 
1257   if (reachable(counter_addr)) {
1258     atomic_incq(as_Address(counter_addr));
1259   } else {
1260     lea(rscratch, counter_addr);
1261     atomic_incq(Address(rscratch, 0));
1262   }
1263 }
1264 #endif
1265 
1266 // Writes to successive stack pages until the given offset is reached, to check for
1267 // stack overflow + shadow pages.  This clobbers tmp.
1268 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1269   movptr(tmp, rsp);
1270   // Bang stack for total size given plus shadow page size.
1271   // Bang one page at a time because large size can bang beyond yellow and
1272   // red zones.
1273   Label loop;
1274   bind(loop);
1275   movl(Address(tmp, (-(int)os::vm_page_size())), size );
1276   subptr(tmp, (int)os::vm_page_size());
1277   subl(size, (int)os::vm_page_size());
1278   jcc(Assembler::greater, loop);
1279 
1280   // Bang down shadow pages too.
1281   // At this point, (tmp-0) is the last address touched, so don't
1282   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1283   // was post-decremented.)  Skip this address by starting at i=1, and
1284   // touch a few more pages below.  N.B.  It is important to touch all
1285   // the way down including all pages in the shadow zone.
1286   for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()); i++) {
1287     // this could be any sized move, but since it can serve as a debugging crumb
1288     // the bigger the better.
1289     movptr(Address(tmp, (-i*(int)os::vm_page_size())), size );
1290   }
1291 }
1292 
1293 void MacroAssembler::reserved_stack_check() {
1294   // testing if reserved zone needs to be enabled
1295   Label no_reserved_zone_enabling;
1296   Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1297   NOT_LP64(get_thread(rsi);)
1298 
1299   cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1300   jcc(Assembler::below, no_reserved_zone_enabling);
1301 
1302   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1303   jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1304   should_not_reach_here();
1305 
1306   bind(no_reserved_zone_enabling);
1307 }
1308 
1309 void MacroAssembler::c2bool(Register x) {
1310   // implements x == 0 ? 0 : 1
1311   // note: must only look at least-significant byte of x
1312   //       since C-style booleans are stored in one byte
1313   //       only! (was bug)
1314   andl(x, 0xFF);
1315   setb(Assembler::notZero, x);
1316 }
1317 
1318 // Wouldn't be needed if the AddressLiteral version had a new name
1319 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1320   Assembler::call(L, rtype);
1321 }
1322 
1323 void MacroAssembler::call(Register entry) {
1324   Assembler::call(entry);
1325 }
1326 
1327 void MacroAssembler::call(AddressLiteral entry, Register rscratch) {
1328   assert(rscratch != noreg || always_reachable(entry), "missing");
1329 
1330   if (reachable(entry)) {
1331     Assembler::call_literal(entry.target(), entry.rspec());
1332   } else {
1333     lea(rscratch, entry);
1334     Assembler::call(rscratch);
1335   }
1336 }
1337 
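// Inline cache call: rax is pre-loaded with the non_oop_word sentinel (patched later
// by the inline cache machinery) and the call site gets a virtual_call relocation.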
1338 void MacroAssembler::ic_call(address entry, jint method_index) {
1339   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
1340 #ifdef _LP64
1341   // Needs full 64-bit immediate for later patching.
1342   mov64(rax, (intptr_t)Universe::non_oop_word());
1343 #else
1344   movptr(rax, (intptr_t)Universe::non_oop_word());
1345 #endif
1346   call(AddressLiteral(entry, rh));
1347 }
1348 
1349 void MacroAssembler::emit_static_call_stub() {
1350   // Static stub relocation also tags the Method* in the code-stream.
1351   mov_metadata(rbx, (Metadata*) nullptr);  // Method is zapped till fixup time.
1352   // This is recognized as unresolved by relocs/nativeinst/ic code.
1353   jump(RuntimeAddress(pc()));
1354 }
1355 
1356 // Implementation of call_VM versions
1357 
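// Each of these variants emits the VM call out of line: the leading "call C" pushes
// a return address and transfers to the block at C, which passes the arguments,
// performs the actual call via call_VM_helper, and returns to the "jmp E" that
// skips over it.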
1358 void MacroAssembler::call_VM(Register oop_result,
1359                              address entry_point,
1360                              bool check_exceptions) {
1361   Label C, E;
1362   call(C, relocInfo::none);
1363   jmp(E);
1364 
1365   bind(C);
1366   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1367   ret(0);
1368 
1369   bind(E);
1370 }
1371 
1372 void MacroAssembler::call_VM(Register oop_result,
1373                              address entry_point,
1374                              Register arg_1,
1375                              bool check_exceptions) {
1376   Label C, E;
1377   call(C, relocInfo::none);
1378   jmp(E);
1379 
1380   bind(C);
1381   pass_arg1(this, arg_1);
1382   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1383   ret(0);
1384 
1385   bind(E);
1386 }
1387 
1388 void MacroAssembler::call_VM(Register oop_result,
1389                              address entry_point,
1390                              Register arg_1,
1391                              Register arg_2,
1392                              bool check_exceptions) {
1393   Label C, E;
1394   call(C, relocInfo::none);
1395   jmp(E);
1396 
1397   bind(C);
1398 
1399   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1400 
1401   pass_arg2(this, arg_2);
1402   pass_arg1(this, arg_1);
1403   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1404   ret(0);
1405 
1406   bind(E);
1407 }
1408 
1409 void MacroAssembler::call_VM(Register oop_result,
1410                              address entry_point,
1411                              Register arg_1,
1412                              Register arg_2,
1413                              Register arg_3,
1414                              bool check_exceptions) {
1415   Label C, E;
1416   call(C, relocInfo::none);
1417   jmp(E);
1418 
1419   bind(C);
1420 
1421   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1422   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1423   pass_arg3(this, arg_3);
1424 
1425   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1426   pass_arg2(this, arg_2);
1427 
1428   pass_arg1(this, arg_1);
1429   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1430   ret(0);
1431 
1432   bind(E);
1433 }
1434 
1435 void MacroAssembler::call_VM(Register oop_result,
1436                              Register last_java_sp,
1437                              address entry_point,
1438                              int number_of_arguments,
1439                              bool check_exceptions) {
1440   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1441   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1442 }
1443 
1444 void MacroAssembler::call_VM(Register oop_result,
1445                              Register last_java_sp,
1446                              address entry_point,
1447                              Register arg_1,
1448                              bool check_exceptions) {
1449   pass_arg1(this, arg_1);
1450   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1451 }
1452 
1453 void MacroAssembler::call_VM(Register oop_result,
1454                              Register last_java_sp,
1455                              address entry_point,
1456                              Register arg_1,
1457                              Register arg_2,
1458                              bool check_exceptions) {
1459 
1460   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1461   pass_arg2(this, arg_2);
1462   pass_arg1(this, arg_1);
1463   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1464 }
1465 
1466 void MacroAssembler::call_VM(Register oop_result,
1467                              Register last_java_sp,
1468                              address entry_point,
1469                              Register arg_1,
1470                              Register arg_2,
1471                              Register arg_3,
1472                              bool check_exceptions) {
1473   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1474   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1475   pass_arg3(this, arg_3);
1476   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1477   pass_arg2(this, arg_2);
1478   pass_arg1(this, arg_1);
1479   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1480 }
1481 
1482 void MacroAssembler::super_call_VM(Register oop_result,
1483                                    Register last_java_sp,
1484                                    address entry_point,
1485                                    int number_of_arguments,
1486                                    bool check_exceptions) {
1487   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1488   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1489 }
1490 
1491 void MacroAssembler::super_call_VM(Register oop_result,
1492                                    Register last_java_sp,
1493                                    address entry_point,
1494                                    Register arg_1,
1495                                    bool check_exceptions) {
1496   pass_arg1(this, arg_1);
1497   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1498 }
1499 
1500 void MacroAssembler::super_call_VM(Register oop_result,
1501                                    Register last_java_sp,
1502                                    address entry_point,
1503                                    Register arg_1,
1504                                    Register arg_2,
1505                                    bool check_exceptions) {
1506 
1507   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1508   pass_arg2(this, arg_2);
1509   pass_arg1(this, arg_1);
1510   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1511 }
1512 
1513 void MacroAssembler::super_call_VM(Register oop_result,
1514                                    Register last_java_sp,
1515                                    address entry_point,
1516                                    Register arg_1,
1517                                    Register arg_2,
1518                                    Register arg_3,
1519                                    bool check_exceptions) {
1520   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1521   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1522   pass_arg3(this, arg_3);
1523   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1524   pass_arg2(this, arg_2);
1525   pass_arg1(this, arg_1);
1526   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1527 }
1528 
1529 void MacroAssembler::call_VM_base(Register oop_result,
1530                                   Register java_thread,
1531                                   Register last_java_sp,
1532                                   address  entry_point,
1533                                   int      number_of_arguments,
1534                                   bool     check_exceptions) {
1535   // determine java_thread register
1536   if (!java_thread->is_valid()) {
1537 #ifdef _LP64
1538     java_thread = r15_thread;
1539 #else
1540     java_thread = rdi;
1541     get_thread(java_thread);
1542 #endif // LP64
1543   }
1544   // determine last_java_sp register
1545   if (!last_java_sp->is_valid()) {
1546     last_java_sp = rsp;
1547   }
1548   // debugging support
1549   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
1550   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
1551 #ifdef ASSERT
1552   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1553   // r12 is the heapbase.
1554   LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
1555 #endif // ASSERT
1556 
1557   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
1558   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1559 
1560   // push java thread (becomes first argument of C function)
1561 
1562   NOT_LP64(push(java_thread); number_of_arguments++);
1563   LP64_ONLY(mov(c_rarg0, r15_thread));
1564 
1565   // set last Java frame before call
1566   assert(last_java_sp != rbp, "can't use ebp/rbp");
1567 
1568   // Only interpreter should have to set fp
1569   set_last_Java_frame(java_thread, last_java_sp, rbp, nullptr, rscratch1);
1570 
1571   // do the call, remove parameters
1572   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1573 
  // restore the thread (we cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however, we can use the register value directly if it is callee saved.
1577   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
1578     // rdi & rsi (also r15) are callee saved -> nothing to do
1579 #ifdef ASSERT
1580     guarantee(java_thread != rax, "change this code");
1581     push(rax);
1582     { Label L;
1583       get_thread(rax);
1584       cmpptr(java_thread, rax);
1585       jcc(Assembler::equal, L);
1586       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
1587       bind(L);
1588     }
1589     pop(rax);
1590 #endif
1591   } else {
1592     get_thread(java_thread);
1593   }
1594   // reset last Java frame
1595   // Only interpreter should have to clear fp
1596   reset_last_Java_frame(java_thread, true);
1597 
1598    // C++ interp handles this in the interpreter
1599   check_and_handle_popframe(java_thread);
1600   check_and_handle_earlyret(java_thread);
1601 
1602   if (check_exceptions) {
1603     // check for pending exceptions (java_thread is set upon return)
1604     cmpptr(Address(java_thread, Thread::pending_exception_offset()), NULL_WORD);
1605 #ifndef _LP64
1606     jump_cc(Assembler::notEqual,
1607             RuntimeAddress(StubRoutines::forward_exception_entry()));
1608 #else
    // This used to be a conditional jump to forward_exception; however, after
    // relocation the conditional branch might not reach the target. So we
    // branch around an unconditional jump that can always reach it.
1612 
1613     Label ok;
1614     jcc(Assembler::equal, ok);
1615     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1616     bind(ok);
1617 #endif // LP64
1618   }
1619 
1620   // get oop result if there is one and reset the value in the thread
1621   if (oop_result->is_valid()) {
1622     get_vm_result(oop_result, java_thread);
1623   }
1624 }
1625 
1626 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1627 
  // Calculate the value for last_Java_sp. This is somewhat subtle:
  // call_VM does an intermediate call which places a return address on the
  // stack just under the stack pointer as the user finished with it. This
  // allows us to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32-bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64-bit call_VM can only use register args,
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.
1637 
1638 #ifdef _LP64
1639   // We've pushed one address, correct last_Java_sp
1640   lea(rax, Address(rsp, wordSize));
1641 #else
1642   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
1643 #endif // LP64
1644 
1645   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
1646 
1647 }
1648 
1649 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1650 void MacroAssembler::call_VM_leaf0(address entry_point) {
1651   MacroAssembler::call_VM_leaf_base(entry_point, 0);
1652 }
1653 
1654 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1655   call_VM_leaf_base(entry_point, number_of_arguments);
1656 }
1657 
1658 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1659   pass_arg0(this, arg_0);
1660   call_VM_leaf(entry_point, 1);
1661 }
1662 
1663 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1664 
1665   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1666   pass_arg1(this, arg_1);
1667   pass_arg0(this, arg_0);
1668   call_VM_leaf(entry_point, 2);
1669 }
1670 
1671 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1672   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1673   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1674   pass_arg2(this, arg_2);
1675   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1676   pass_arg1(this, arg_1);
1677   pass_arg0(this, arg_0);
1678   call_VM_leaf(entry_point, 3);
1679 }
1680 
1681 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1682   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1683   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1684   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1685   pass_arg3(this, arg_3);
1686   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1687   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1688   pass_arg2(this, arg_2);
1689   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1690   pass_arg1(this, arg_1);
1691   pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 4);
1693 }
1694 
1695 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1696   pass_arg0(this, arg_0);
1697   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1698 }
1699 
1700 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1701 
1702   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1703   pass_arg1(this, arg_1);
1704   pass_arg0(this, arg_0);
1705   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1706 }
1707 
1708 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1709   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1710   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1711   pass_arg2(this, arg_2);
1712   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1713   pass_arg1(this, arg_1);
1714   pass_arg0(this, arg_0);
1715   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1716 }
1717 
1718 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1719   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1720   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1721   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1722   pass_arg3(this, arg_3);
1723   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1724   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1725   pass_arg2(this, arg_2);
1726   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1727   pass_arg1(this, arg_1);
1728   pass_arg0(this, arg_0);
1729   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1730 }
1731 
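// Fetch the oop left in JavaThread::vm_result by the VM call into oop_result,
// clear the field, and verify the oop.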
1732 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1733   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1734   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
1735   verify_oop_msg(oop_result, "broken oop in call_VM_base");
1736 }
1737 
1738 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1739   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1740   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
1741 }
1742 
1743 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
1744 }
1745 
1746 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
1747 }
1748 
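// The AddressLiteral variants below follow a common pattern: if the literal is
// reachable with a RIP-relative/absolute form it is used directly, otherwise
// the address is first materialized in rscratch (which may only be noreg when
// the literal is always reachable, as the asserts check).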
1749 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm, Register rscratch) {
1750   assert(rscratch != noreg || always_reachable(src1), "missing");
1751 
1752   if (reachable(src1)) {
1753     cmpl(as_Address(src1), imm);
1754   } else {
1755     lea(rscratch, src1);
1756     cmpl(Address(rscratch, 0), imm);
1757   }
1758 }
1759 
1760 void MacroAssembler::cmp32(Register src1, AddressLiteral src2, Register rscratch) {
1761   assert(!src2.is_lval(), "use cmpptr");
1762   assert(rscratch != noreg || always_reachable(src2), "missing");
1763 
1764   if (reachable(src2)) {
1765     cmpl(src1, as_Address(src2));
1766   } else {
1767     lea(rscratch, src2);
1768     cmpl(src1, Address(rscratch, 0));
1769   }
1770 }
1771 
1772 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1773   Assembler::cmpl(src1, imm);
1774 }
1775 
1776 void MacroAssembler::cmp32(Register src1, Address src2) {
1777   Assembler::cmpl(src1, src2);
1778 }
1779 
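// Materialize a three-way double compare into 'dst': -1 if opr1 < opr2, 0 if
// equal, +1 if opr1 > opr2. An unordered (NaN) result yields -1 or +1
// depending on unordered_is_less.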
1780 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1781   ucomisd(opr1, opr2);
1782 
1783   Label L;
1784   if (unordered_is_less) {
1785     movl(dst, -1);
1786     jcc(Assembler::parity, L);
1787     jcc(Assembler::below , L);
1788     movl(dst, 0);
1789     jcc(Assembler::equal , L);
1790     increment(dst);
1791   } else { // unordered is greater
1792     movl(dst, 1);
1793     jcc(Assembler::parity, L);
1794     jcc(Assembler::above , L);
1795     movl(dst, 0);
1796     jcc(Assembler::equal , L);
1797     decrementl(dst);
1798   }
1799   bind(L);
1800 }
1801 
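// Same as cmpsd2int, but for single-precision operands.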
1802 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1803   ucomiss(opr1, opr2);
1804 
1805   Label L;
1806   if (unordered_is_less) {
1807     movl(dst, -1);
1808     jcc(Assembler::parity, L);
1809     jcc(Assembler::below , L);
1810     movl(dst, 0);
1811     jcc(Assembler::equal , L);
1812     increment(dst);
1813   } else { // unordered is greater
1814     movl(dst, 1);
1815     jcc(Assembler::parity, L);
1816     jcc(Assembler::above , L);
1817     movl(dst, 0);
1818     jcc(Assembler::equal , L);
1819     decrementl(dst);
1820   }
1821   bind(L);
1822 }
1823 
1824 
1825 void MacroAssembler::cmp8(AddressLiteral src1, int imm, Register rscratch) {
1826   assert(rscratch != noreg || always_reachable(src1), "missing");
1827 
1828   if (reachable(src1)) {
1829     cmpb(as_Address(src1), imm);
1830   } else {
1831     lea(rscratch, src1);
1832     cmpb(Address(rscratch, 0), imm);
1833   }
1834 }
1835 
1836 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2, Register rscratch) {
1837 #ifdef _LP64
1838   assert(rscratch != noreg || always_reachable(src2), "missing");
1839 
1840   if (src2.is_lval()) {
1841     movptr(rscratch, src2);
1842     Assembler::cmpq(src1, rscratch);
1843   } else if (reachable(src2)) {
1844     cmpq(src1, as_Address(src2));
1845   } else {
1846     lea(rscratch, src2);
1847     Assembler::cmpq(src1, Address(rscratch, 0));
1848   }
1849 #else
1850   assert(rscratch == noreg, "not needed");
1851   if (src2.is_lval()) {
1852     cmp_literal32(src1, (int32_t)src2.target(), src2.rspec());
1853   } else {
1854     cmpl(src1, as_Address(src2));
1855   }
1856 #endif // _LP64
1857 }
1858 
1859 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2, Register rscratch) {
1860   assert(src2.is_lval(), "not a mem-mem compare");
1861 #ifdef _LP64
1862   // moves src2's literal address
1863   movptr(rscratch, src2);
1864   Assembler::cmpq(src1, rscratch);
1865 #else
1866   assert(rscratch == noreg, "not needed");
1867   cmp_literal32(src1, (int32_t)src2.target(), src2.rspec());
1868 #endif // _LP64
1869 }
1870 
1871 void MacroAssembler::cmpoop(Register src1, Register src2) {
1872   cmpptr(src1, src2);
1873 }
1874 
1875 void MacroAssembler::cmpoop(Register src1, Address src2) {
1876   cmpptr(src1, src2);
1877 }
1878 
1879 #ifdef _LP64
1880 void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) {
1881   movoop(rscratch, src2);
1882   cmpptr(src1, rscratch);
1883 }
1884 #endif
1885 
1886 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) {
1887   assert(rscratch != noreg || always_reachable(adr), "missing");
1888 
1889   if (reachable(adr)) {
1890     lock();
1891     cmpxchgptr(reg, as_Address(adr));
1892   } else {
1893     lea(rscratch, adr);
1894     lock();
1895     cmpxchgptr(reg, Address(rscratch, 0));
1896   }
1897 }
1898 
1899 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
1900   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
1901 }
1902 
1903 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1904   assert(rscratch != noreg || always_reachable(src), "missing");
1905 
1906   if (reachable(src)) {
1907     Assembler::comisd(dst, as_Address(src));
1908   } else {
1909     lea(rscratch, src);
1910     Assembler::comisd(dst, Address(rscratch, 0));
1911   }
1912 }
1913 
1914 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1915   assert(rscratch != noreg || always_reachable(src), "missing");
1916 
1917   if (reachable(src)) {
1918     Assembler::comiss(dst, as_Address(src));
1919   } else {
1920     lea(rscratch, src);
1921     Assembler::comiss(dst, Address(rscratch, 0));
1922   }
1923 }
1924 
1925 
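// Conditionally increment a 32-bit counter in memory: the increment only
// happens when 'cond' holds, and the flags are saved/restored around it so
// the caller's condition codes are preserved.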
1926 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch) {
1927   assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1928 
1929   Condition negated_cond = negate_condition(cond);
1930   Label L;
1931   jcc(negated_cond, L);
1932   pushf(); // Preserve flags
1933   atomic_incl(counter_addr, rscratch);
1934   popf();
1935   bind(L);
1936 }
1937 
1938 int MacroAssembler::corrected_idivl(Register reg) {
1939   // Full implementation of Java idiv and irem; checks for
1940   // special case as described in JVM spec., p.243 & p.271.
1941   // The function returns the (pc) offset of the idivl
1942   // instruction - may be needed for implicit exceptions.
1943   //
1944   //         normal case                           special case
1945   //
1946   // input : rax,: dividend                         min_int
1947   //         reg: divisor   (may not be rax,/rdx)   -1
1948   //
1949   // output: rax,: quotient  (= rax, idiv reg)       min_int
1950   //         rdx: remainder (= rax, irem reg)       0
1951   assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
1952   const int min_int = 0x80000000;
1953   Label normal_case, special_case;
1954 
1955   // check for special case
1956   cmpl(rax, min_int);
1957   jcc(Assembler::notEqual, normal_case);
1958   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
1959   cmpl(reg, -1);
1960   jcc(Assembler::equal, special_case);
1961 
1962   // handle normal case
1963   bind(normal_case);
1964   cdql();
1965   int idivl_offset = offset();
1966   idivl(reg);
1967 
1968   // normal and special case exit
1969   bind(special_case);
1970 
1971   return idivl_offset;
1972 }
1973 
1974 
1975 
1976 void MacroAssembler::decrementl(Register reg, int value) {
1977   if (value == min_jint) {subl(reg, value) ; return; }
1978   if (value <  0) { incrementl(reg, -value); return; }
1979   if (value == 0) {                        ; return; }
1980   if (value == 1 && UseIncDec) { decl(reg) ; return; }
1981   /* else */      { subl(reg, value)       ; return; }
1982 }
1983 
1984 void MacroAssembler::decrementl(Address dst, int value) {
1985   if (value == min_jint) {subl(dst, value) ; return; }
1986   if (value <  0) { incrementl(dst, -value); return; }
1987   if (value == 0) {                        ; return; }
1988   if (value == 1 && UseIncDec) { decl(dst) ; return; }
1989   /* else */      { subl(dst, value)       ; return; }
1990 }
1991 
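// Signed division by 2^shift_value via an arithmetic right shift: a negative
// dividend is first biased by (2^shift_value - 1) so the result rounds toward
// zero, matching Java division semantics.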
void MacroAssembler::division_with_shift(Register reg, int shift_value) {
  assert(shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl(reg, reg);
  jcc(Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1;

  if (offset == 1) {
    incrementl(reg);
  } else {
    addl(reg, offset);
  }

  bind(_is_positive);
  sarl(reg, shift_value);
}
2008 
2009 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2010   assert(rscratch != noreg || always_reachable(src), "missing");
2011 
2012   if (reachable(src)) {
2013     Assembler::divsd(dst, as_Address(src));
2014   } else {
2015     lea(rscratch, src);
2016     Assembler::divsd(dst, Address(rscratch, 0));
2017   }
2018 }
2019 
2020 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2021   assert(rscratch != noreg || always_reachable(src), "missing");
2022 
2023   if (reachable(src)) {
2024     Assembler::divss(dst, as_Address(src));
2025   } else {
2026     lea(rscratch, src);
2027     Assembler::divss(dst, Address(rscratch, 0));
2028   }
2029 }
2030 
2031 void MacroAssembler::enter() {
2032   push(rbp);
2033   mov(rbp, rsp);
2034 }
2035 
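// Emit an 8-byte NOP (0F 1F 84 00 <imm32>) carrying a post_call_nop
// relocation. Only emitted when continuations are enabled.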
2036 void MacroAssembler::post_call_nop() {
2037   if (!Continuations::enabled()) {
2038     return;
2039   }
2040   InstructionMark im(this);
2041   relocate(post_call_nop_Relocation::spec());
2042   InlineSkippedInstructionsCounter skipCounter(this);
2043   emit_int8((uint8_t)0x0f);
2044   emit_int8((uint8_t)0x1f);
2045   emit_int8((uint8_t)0x84);
2046   emit_int8((uint8_t)0x00);
2047   emit_int32(0x00);
2048 }
2049 
2050 // A 5 byte nop that is safe for patching (see patch_verified_entry)
2051 void MacroAssembler::fat_nop() {
2052   if (UseAddressNop) {
2053     addr_nop_5();
2054   } else {
2055     emit_int8((uint8_t)0x26); // es:
2056     emit_int8((uint8_t)0x2e); // cs:
2057     emit_int8((uint8_t)0x64); // fs:
2058     emit_int8((uint8_t)0x65); // gs:
2059     emit_int8((uint8_t)0x90);
2060   }
2061 }
2062 
2063 #ifndef _LP64
2064 void MacroAssembler::fcmp(Register tmp) {
2065   fcmp(tmp, 1, true, true);
2066 }
2067 
2068 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2069   assert(!pop_right || pop_left, "usage error");
2070   if (VM_Version::supports_cmov()) {
2071     assert(tmp == noreg, "unneeded temp");
2072     if (pop_left) {
2073       fucomip(index);
2074     } else {
2075       fucomi(index);
2076     }
2077     if (pop_right) {
2078       fpop();
2079     }
2080   } else {
2081     assert(tmp != noreg, "need temp");
2082     if (pop_left) {
2083       if (pop_right) {
2084         fcompp();
2085       } else {
2086         fcomp(index);
2087       }
2088     } else {
2089       fcom(index);
2090     }
2091     // convert FPU condition into eflags condition via rax,
2092     save_rax(tmp);
2093     fwait(); fnstsw_ax();
2094     sahf();
2095     restore_rax(tmp);
2096   }
2097   // condition codes set as follows:
2098   //
2099   // CF (corresponds to C0) if x < y
2100   // PF (corresponds to C2) if unordered
2101   // ZF (corresponds to C3) if x = y
2102 }
2103 
2104 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2105   fcmp2int(dst, unordered_is_less, 1, true, true);
2106 }
2107 
2108 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2109   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2110   Label L;
2111   if (unordered_is_less) {
2112     movl(dst, -1);
2113     jcc(Assembler::parity, L);
2114     jcc(Assembler::below , L);
2115     movl(dst, 0);
2116     jcc(Assembler::equal , L);
2117     increment(dst);
2118   } else { // unordered is greater
2119     movl(dst, 1);
2120     jcc(Assembler::parity, L);
2121     jcc(Assembler::above , L);
2122     movl(dst, 0);
2123     jcc(Assembler::equal , L);
2124     decrementl(dst);
2125   }
2126   bind(L);
2127 }
2128 
2129 void MacroAssembler::fld_d(AddressLiteral src) {
2130   fld_d(as_Address(src));
2131 }
2132 
2133 void MacroAssembler::fld_s(AddressLiteral src) {
2134   fld_s(as_Address(src));
2135 }
2136 
2137 void MacroAssembler::fldcw(AddressLiteral src) {
2138   fldcw(as_Address(src));
2139 }
2140 
2141 void MacroAssembler::fpop() {
2142   ffree();
2143   fincstp();
2144 }
2145 
2146 void MacroAssembler::fremr(Register tmp) {
2147   save_rax(tmp);
2148   { Label L;
2149     bind(L);
2150     fprem();
2151     fwait(); fnstsw_ax();
2152     sahf();
2153     jcc(Assembler::parity, L);
2154   }
2155   restore_rax(tmp);
2156   // Result is in ST0.
2157   // Note: fxch & fpop to get rid of ST1
2158   // (otherwise FPU stack could overflow eventually)
2159   fxch(1);
2160   fpop();
2161 }
2162 
2163 void MacroAssembler::empty_FPU_stack() {
2164   if (VM_Version::supports_mmx()) {
2165     emms();
2166   } else {
2167     for (int i = 8; i-- > 0; ) ffree(i);
2168   }
2169 }
2170 #endif // !LP64
2171 
2172 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2173   assert(rscratch != noreg || always_reachable(src), "missing");
2174   if (reachable(src)) {
2175     Assembler::mulpd(dst, as_Address(src));
2176   } else {
2177     lea(rscratch, src);
2178     Assembler::mulpd(dst, Address(rscratch, 0));
2179   }
2180 }
2181 
2182 void MacroAssembler::load_float(Address src) {
2183 #ifdef _LP64
2184   movflt(xmm0, src);
2185 #else
2186   if (UseSSE >= 1) {
2187     movflt(xmm0, src);
2188   } else {
2189     fld_s(src);
2190   }
2191 #endif // LP64
2192 }
2193 
2194 void MacroAssembler::store_float(Address dst) {
2195 #ifdef _LP64
2196   movflt(dst, xmm0);
2197 #else
2198   if (UseSSE >= 1) {
2199     movflt(dst, xmm0);
2200   } else {
2201     fstp_s(dst);
2202   }
2203 #endif // LP64
2204 }
2205 
2206 void MacroAssembler::load_double(Address src) {
2207 #ifdef _LP64
2208   movdbl(xmm0, src);
2209 #else
2210   if (UseSSE >= 2) {
2211     movdbl(xmm0, src);
2212   } else {
2213     fld_d(src);
2214   }
2215 #endif // LP64
2216 }
2217 
2218 void MacroAssembler::store_double(Address dst) {
2219 #ifdef _LP64
2220   movdbl(dst, xmm0);
2221 #else
2222   if (UseSSE >= 2) {
2223     movdbl(dst, xmm0);
2224   } else {
2225     fstp_d(dst);
2226   }
2227 #endif // LP64
2228 }
2229 
2230 // dst = c = a * b + c
2231 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2232   Assembler::vfmadd231sd(c, a, b);
2233   if (dst != c) {
2234     movdbl(dst, c);
2235   }
2236 }
2237 
2238 // dst = c = a * b + c
2239 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2240   Assembler::vfmadd231ss(c, a, b);
2241   if (dst != c) {
2242     movflt(dst, c);
2243   }
2244 }
2245 
2246 // dst = c = a * b + c
2247 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2248   Assembler::vfmadd231pd(c, a, b, vector_len);
2249   if (dst != c) {
2250     vmovdqu(dst, c);
2251   }
2252 }
2253 
2254 // dst = c = a * b + c
2255 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2256   Assembler::vfmadd231ps(c, a, b, vector_len);
2257   if (dst != c) {
2258     vmovdqu(dst, c);
2259   }
2260 }
2261 
2262 // dst = c = a * b + c
2263 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2264   Assembler::vfmadd231pd(c, a, b, vector_len);
2265   if (dst != c) {
2266     vmovdqu(dst, c);
2267   }
2268 }
2269 
2270 // dst = c = a * b + c
2271 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2272   Assembler::vfmadd231ps(c, a, b, vector_len);
2273   if (dst != c) {
2274     vmovdqu(dst, c);
2275   }
2276 }
2277 
2278 void MacroAssembler::incrementl(AddressLiteral dst, Register rscratch) {
2279   assert(rscratch != noreg || always_reachable(dst), "missing");
2280 
2281   if (reachable(dst)) {
2282     incrementl(as_Address(dst));
2283   } else {
2284     lea(rscratch, dst);
2285     incrementl(Address(rscratch, 0));
2286   }
2287 }
2288 
2289 void MacroAssembler::incrementl(ArrayAddress dst, Register rscratch) {
2290   incrementl(as_Address(dst, rscratch));
2291 }
2292 
2293 void MacroAssembler::incrementl(Register reg, int value) {
2294   if (value == min_jint) {addl(reg, value) ; return; }
2295   if (value <  0) { decrementl(reg, -value); return; }
2296   if (value == 0) {                        ; return; }
2297   if (value == 1 && UseIncDec) { incl(reg) ; return; }
2298   /* else */      { addl(reg, value)       ; return; }
2299 }
2300 
2301 void MacroAssembler::incrementl(Address dst, int value) {
2302   if (value == min_jint) {addl(dst, value) ; return; }
2303   if (value <  0) { decrementl(dst, -value); return; }
2304   if (value == 0) {                        ; return; }
2305   if (value == 1 && UseIncDec) { incl(dst) ; return; }
2306   /* else */      { addl(dst, value)       ; return; }
2307 }
2308 
2309 void MacroAssembler::jump(AddressLiteral dst, Register rscratch) {
2310   assert(rscratch != noreg || always_reachable(dst), "missing");
2311 
2312   if (reachable(dst)) {
2313     jmp_literal(dst.target(), dst.rspec());
2314   } else {
2315     lea(rscratch, dst);
2316     jmp(rscratch);
2317   }
2318 }
2319 
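// Conditional jump to a literal destination. If the target is reachable the
// jcc is emitted by hand (short or near form, relative to pc()); otherwise
// the condition is reversed and an indirect jmp through rscratch is used.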
2320 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst, Register rscratch) {
2321   assert(rscratch != noreg || always_reachable(dst), "missing");
2322 
2323   if (reachable(dst)) {
2324     InstructionMark im(this);
2325     relocate(dst.reloc());
2326     const int short_size = 2;
2327     const int long_size = 6;
2328     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
2329     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
2330       // 0111 tttn #8-bit disp
2331       emit_int8(0x70 | cc);
2332       emit_int8((offs - short_size) & 0xFF);
2333     } else {
2334       // 0000 1111 1000 tttn #32-bit disp
2335       emit_int8(0x0F);
2336       emit_int8((unsigned char)(0x80 | cc));
2337       emit_int32(offs - long_size);
2338     }
2339   } else {
2340 #ifdef ASSERT
2341     warning("reversing conditional branch");
2342 #endif /* ASSERT */
2343     Label skip;
2344     jccb(reverse[cc], skip);
2345     lea(rscratch, dst);
2346     Assembler::jmp(rscratch);
2347     bind(skip);
2348   }
2349 }
2350 
2351 void MacroAssembler::ldmxcsr(AddressLiteral src, Register rscratch) {
2352   assert(rscratch != noreg || always_reachable(src), "missing");
2353 
2354   if (reachable(src)) {
2355     Assembler::ldmxcsr(as_Address(src));
2356   } else {
2357     lea(rscratch, src);
2358     Assembler::ldmxcsr(Address(rscratch, 0));
2359   }
2360 }
2361 
2362 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2363   int off;
2364   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2365     off = offset();
2366     movsbl(dst, src); // movsxb
2367   } else {
2368     off = load_unsigned_byte(dst, src);
2369     shll(dst, 24);
2370     sarl(dst, 24);
2371   }
2372   return off;
2373 }
2374 
2375 // Note: load_signed_short used to be called load_signed_word.
2376 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
2377 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
2378 // The term "word" in HotSpot means a 32- or 64-bit machine word.
2379 int MacroAssembler::load_signed_short(Register dst, Address src) {
2380   int off;
2381   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
    // version, but this is what 64-bit has always done. This seems to imply
    // that callers only use 32 bits' worth.
2385     off = offset();
2386     movswl(dst, src); // movsxw
2387   } else {
2388     off = load_unsigned_short(dst, src);
2389     shll(dst, 16);
2390     sarl(dst, 16);
2391   }
2392   return off;
2393 }
2394 
2395 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
  // and "3.9 Partial Register Penalties", p. 22.
2398   int off;
2399   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
2400     off = offset();
2401     movzbl(dst, src); // movzxb
2402   } else {
2403     xorl(dst, dst);
2404     off = offset();
2405     movb(dst, src);
2406   }
2407   return off;
2408 }
2409 
2410 // Note: load_unsigned_short used to be called load_unsigned_word.
2411 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
  // and "3.9 Partial Register Penalties", p. 22.
2414   int off;
2415   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
2416     off = offset();
2417     movzwl(dst, src); // movzxw
2418   } else {
2419     xorl(dst, dst);
2420     off = offset();
2421     movw(dst, src);
2422   }
2423   return off;
2424 }
2425 
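// Load a 1-, 2-, 4- or 8-byte value from 'src' into 'dst'. On 32-bit, an
// 8-byte value is returned in the pair dst (low word) / dst2 (high word).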
2426 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
2427   switch (size_in_bytes) {
2428 #ifndef _LP64
2429   case  8:
2430     assert(dst2 != noreg, "second dest register required");
2431     movl(dst,  src);
2432     movl(dst2, src.plus_disp(BytesPerInt));
2433     break;
2434 #else
2435   case  8:  movq(dst, src); break;
2436 #endif
2437   case  4:  movl(dst, src); break;
2438   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2439   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2440   default:  ShouldNotReachHere();
2441   }
2442 }
2443 
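// Store a 1-, 2-, 4- or 8-byte value to 'dst'. On 32-bit, an 8-byte value is
// supplied in the pair src (low word) / src2 (high word).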
2444 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2445   switch (size_in_bytes) {
2446 #ifndef _LP64
2447   case  8:
2448     assert(src2 != noreg, "second source register required");
2449     movl(dst,                        src);
2450     movl(dst.plus_disp(BytesPerInt), src2);
2451     break;
2452 #else
2453   case  8:  movq(dst, src); break;
2454 #endif
2455   case  4:  movl(dst, src); break;
2456   case  2:  movw(dst, src); break;
2457   case  1:  movb(dst, src); break;
2458   default:  ShouldNotReachHere();
2459   }
2460 }
2461 
2462 void MacroAssembler::mov32(AddressLiteral dst, Register src, Register rscratch) {
2463   assert(rscratch != noreg || always_reachable(dst), "missing");
2464 
2465   if (reachable(dst)) {
2466     movl(as_Address(dst), src);
2467   } else {
2468     lea(rscratch, dst);
2469     movl(Address(rscratch, 0), src);
2470   }
2471 }
2472 
2473 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2474   if (reachable(src)) {
2475     movl(dst, as_Address(src));
2476   } else {
2477     lea(dst, src);
2478     movl(dst, Address(dst, 0));
2479   }
2480 }
2481 
2482 // C++ bool manipulation
2483 
void MacroAssembler::movbool(Register dst, Address src) {
  if (sizeof(bool) == 1)
    movb(dst, src);
  else if (sizeof(bool) == 2)
    movw(dst, src);
  else if (sizeof(bool) == 4)
    movl(dst, src);
  else
    // unsupported
    ShouldNotReachHere();
}

void MacroAssembler::movbool(Address dst, bool boolconst) {
  if (sizeof(bool) == 1)
    movb(dst, (int) boolconst);
  else if (sizeof(bool) == 2)
    movw(dst, (int) boolconst);
  else if (sizeof(bool) == 4)
    movl(dst, (int) boolconst);
  else
    // unsupported
    ShouldNotReachHere();
}

void MacroAssembler::movbool(Address dst, Register src) {
  if (sizeof(bool) == 1)
    movb(dst, src);
  else if (sizeof(bool) == 2)
    movw(dst, src);
  else if (sizeof(bool) == 4)
    movl(dst, src);
  else
    // unsupported
    ShouldNotReachHere();
}
2519 
2520 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src, Register rscratch) {
2521   assert(rscratch != noreg || always_reachable(src), "missing");
2522 
2523   if (reachable(src)) {
2524     movdl(dst, as_Address(src));
2525   } else {
2526     lea(rscratch, src);
2527     movdl(dst, Address(rscratch, 0));
2528   }
2529 }
2530 
2531 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src, Register rscratch) {
2532   assert(rscratch != noreg || always_reachable(src), "missing");
2533 
2534   if (reachable(src)) {
2535     movq(dst, as_Address(src));
2536   } else {
2537     lea(rscratch, src);
2538     movq(dst, Address(rscratch, 0));
2539   }
2540 }
2541 
2542 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src, Register rscratch) {
2543   assert(rscratch != noreg || always_reachable(src), "missing");
2544 
2545   if (reachable(src)) {
2546     if (UseXmmLoadAndClearUpper) {
2547       movsd (dst, as_Address(src));
2548     } else {
2549       movlpd(dst, as_Address(src));
2550     }
2551   } else {
2552     lea(rscratch, src);
2553     if (UseXmmLoadAndClearUpper) {
2554       movsd (dst, Address(rscratch, 0));
2555     } else {
2556       movlpd(dst, Address(rscratch, 0));
2557     }
2558   }
2559 }
2560 
2561 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src, Register rscratch) {
2562   assert(rscratch != noreg || always_reachable(src), "missing");
2563 
2564   if (reachable(src)) {
2565     movss(dst, as_Address(src));
2566   } else {
2567     lea(rscratch, src);
2568     movss(dst, Address(rscratch, 0));
2569   }
2570 }
2571 
2572 void MacroAssembler::movptr(Register dst, Register src) {
2573   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2574 }
2575 
2576 void MacroAssembler::movptr(Register dst, Address src) {
2577   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2578 }
2579 
2580 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
2581 void MacroAssembler::movptr(Register dst, intptr_t src) {
2582 #ifdef _LP64
2583   if (is_simm32(src)) {
2584     movq(dst, checked_cast<int32_t>(src));
2585   } else {
2586     mov64(dst, src);
2587   }
2588 #else
2589   movl(dst, src);
2590 #endif
2591 }
2592 
2593 void MacroAssembler::movptr(Address dst, Register src) {
2594   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2595 }
2596 
2597 void MacroAssembler::movptr(Address dst, int32_t src) {
2598   LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src));
2599 }
2600 
2601 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2602   assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2603   Assembler::movdqu(dst, src);
2604 }
2605 
2606 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2607   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2608   Assembler::movdqu(dst, src);
2609 }
2610 
2611 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2612   assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2613   Assembler::movdqu(dst, src);
2614 }
2615 
2616 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2617   assert(rscratch != noreg || always_reachable(src), "missing");
2618 
2619   if (reachable(src)) {
2620     movdqu(dst, as_Address(src));
2621   } else {
2622     lea(rscratch, src);
2623     movdqu(dst, Address(rscratch, 0));
2624   }
2625 }
2626 
2627 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2628   assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2629   Assembler::vmovdqu(dst, src);
2630 }
2631 
2632 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2633   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2634   Assembler::vmovdqu(dst, src);
2635 }
2636 
2637 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2638   assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2639   Assembler::vmovdqu(dst, src);
2640 }
2641 
2642 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2643   assert(rscratch != noreg || always_reachable(src), "missing");
2644 
  if (reachable(src)) {
    vmovdqu(dst, as_Address(src));
  } else {
2649     lea(rscratch, src);
2650     vmovdqu(dst, Address(rscratch, 0));
2651   }
2652 }
2653 
2654 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2655   assert(rscratch != noreg || always_reachable(src), "missing");
2656 
2657   if (vector_len == AVX_512bit) {
2658     evmovdquq(dst, src, AVX_512bit, rscratch);
2659   } else if (vector_len == AVX_256bit) {
2660     vmovdqu(dst, src, rscratch);
2661   } else {
2662     movdqu(dst, src, rscratch);
2663   }
2664 }
2665 
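// The kmov overloads dispatch on opmask width: with AVX-512BW the mask
// registers are 64 bits wide (kmovql), otherwise only the 16-bit kmovwl form
// is available (EVEX support is asserted).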
2666 void MacroAssembler::kmov(KRegister dst, Address src) {
2667   if (VM_Version::supports_avx512bw()) {
2668     kmovql(dst, src);
2669   } else {
2670     assert(VM_Version::supports_evex(), "");
2671     kmovwl(dst, src);
2672   }
2673 }
2674 
2675 void MacroAssembler::kmov(Address dst, KRegister src) {
2676   if (VM_Version::supports_avx512bw()) {
2677     kmovql(dst, src);
2678   } else {
2679     assert(VM_Version::supports_evex(), "");
2680     kmovwl(dst, src);
2681   }
2682 }
2683 
2684 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2685   if (VM_Version::supports_avx512bw()) {
2686     kmovql(dst, src);
2687   } else {
2688     assert(VM_Version::supports_evex(), "");
2689     kmovwl(dst, src);
2690   }
2691 }
2692 
2693 void MacroAssembler::kmov(Register dst, KRegister src) {
2694   if (VM_Version::supports_avx512bw()) {
2695     kmovql(dst, src);
2696   } else {
2697     assert(VM_Version::supports_evex(), "");
2698     kmovwl(dst, src);
2699   }
2700 }
2701 
2702 void MacroAssembler::kmov(KRegister dst, Register src) {
2703   if (VM_Version::supports_avx512bw()) {
2704     kmovql(dst, src);
2705   } else {
2706     assert(VM_Version::supports_evex(), "");
2707     kmovwl(dst, src);
2708   }
2709 }
2710 
2711 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register rscratch) {
2712   assert(rscratch != noreg || always_reachable(src), "missing");
2713 
2714   if (reachable(src)) {
2715     kmovql(dst, as_Address(src));
2716   } else {
2717     lea(rscratch, src);
2718     kmovql(dst, Address(rscratch, 0));
2719   }
2720 }
2721 
2722 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register rscratch) {
2723   assert(rscratch != noreg || always_reachable(src), "missing");
2724 
2725   if (reachable(src)) {
2726     kmovwl(dst, as_Address(src));
2727   } else {
2728     lea(rscratch, src);
2729     kmovwl(dst, Address(rscratch, 0));
2730   }
2731 }
2732 
2733 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2734                                int vector_len, Register rscratch) {
2735   assert(rscratch != noreg || always_reachable(src), "missing");
2736 
2737   if (reachable(src)) {
2738     Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2739   } else {
2740     lea(rscratch, src);
2741     Assembler::evmovdqub(dst, mask, Address(rscratch, 0), merge, vector_len);
2742   }
2743 }
2744 
2745 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2746                                int vector_len, Register rscratch) {
2747   assert(rscratch != noreg || always_reachable(src), "missing");
2748 
2749   if (reachable(src)) {
2750     Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2751   } else {
2752     lea(rscratch, src);
2753     Assembler::evmovdquw(dst, mask, Address(rscratch, 0), merge, vector_len);
2754   }
2755 }
2756 
2757 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2758   assert(rscratch != noreg || always_reachable(src), "missing");
2759 
2760   if (reachable(src)) {
2761     Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2762   } else {
2763     lea(rscratch, src);
2764     Assembler::evmovdqul(dst, mask, Address(rscratch, 0), merge, vector_len);
2765   }
2766 }
2767 
2768 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2769   assert(rscratch != noreg || always_reachable(src), "missing");
2770 
2771   if (reachable(src)) {
2772     Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2773   } else {
2774     lea(rscratch, src);
2775     Assembler::evmovdquq(dst, mask, Address(rscratch, 0), merge, vector_len);
2776   }
2777 }
2778 
2779 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2780   assert(rscratch != noreg || always_reachable(src), "missing");
2781 
2782   if (reachable(src)) {
2783     Assembler::evmovdquq(dst, as_Address(src), vector_len);
2784   } else {
2785     lea(rscratch, src);
2786     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2787   }
2788 }
2789 
2790 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
2791   assert(rscratch != noreg || always_reachable(src), "missing");
2792 
2793   if (reachable(src)) {
2794     Assembler::movdqa(dst, as_Address(src));
2795   } else {
2796     lea(rscratch, src);
2797     Assembler::movdqa(dst, Address(rscratch, 0));
2798   }
2799 }
2800 
2801 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2802   assert(rscratch != noreg || always_reachable(src), "missing");
2803 
2804   if (reachable(src)) {
2805     Assembler::movsd(dst, as_Address(src));
2806   } else {
2807     lea(rscratch, src);
2808     Assembler::movsd(dst, Address(rscratch, 0));
2809   }
2810 }
2811 
2812 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2813   assert(rscratch != noreg || always_reachable(src), "missing");
2814 
2815   if (reachable(src)) {
2816     Assembler::movss(dst, as_Address(src));
2817   } else {
2818     lea(rscratch, src);
2819     Assembler::movss(dst, Address(rscratch, 0));
2820   }
2821 }
2822 
2823 void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) {
2824   assert(rscratch != noreg || always_reachable(src), "missing");
2825 
2826   if (reachable(src)) {
2827     Assembler::movddup(dst, as_Address(src));
2828   } else {
2829     lea(rscratch, src);
2830     Assembler::movddup(dst, Address(rscratch, 0));
2831   }
2832 }
2833 
2834 void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2835   assert(rscratch != noreg || always_reachable(src), "missing");
2836 
2837   if (reachable(src)) {
2838     Assembler::vmovddup(dst, as_Address(src), vector_len);
2839   } else {
2840     lea(rscratch, src);
2841     Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
2842   }
2843 }
2844 
2845 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2846   assert(rscratch != noreg || always_reachable(src), "missing");
2847 
2848   if (reachable(src)) {
2849     Assembler::mulsd(dst, as_Address(src));
2850   } else {
2851     lea(rscratch, src);
2852     Assembler::mulsd(dst, Address(rscratch, 0));
2853   }
2854 }
2855 
2856 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2857   assert(rscratch != noreg || always_reachable(src), "missing");
2858 
2859   if (reachable(src)) {
2860     Assembler::mulss(dst, as_Address(src));
2861   } else {
2862     lea(rscratch, src);
2863     Assembler::mulss(dst, Address(rscratch, 0));
2864   }
2865 }
2866 
2867 void MacroAssembler::null_check(Register reg, int offset) {
2868   if (needs_explicit_null_check(offset)) {
2869     // provoke OS null exception if reg is null by
2870     // accessing M[reg] w/o changing any (non-CC) registers
2871     // NOTE: cmpl is plenty here to provoke a segv
2872     cmpptr(rax, Address(reg, 0));
2873     // Note: should probably use testl(rax, Address(reg, 0));
2874     //       may be shorter code (however, this version of
2875     //       testl needs to be implemented first)
2876   } else {
2877     // nothing to do, (later) access of M[reg + offset]
2878     // will provoke OS null exception if reg is null
2879   }
2880 }
2881 
2882 void MacroAssembler::os_breakpoint() {
2883   // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
2884   // (e.g., MSVC can't call ps() otherwise)
2885   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2886 }
2887 
2888 void MacroAssembler::unimplemented(const char* what) {
2889   const char* buf = nullptr;
2890   {
2891     ResourceMark rm;
2892     stringStream ss;
2893     ss.print("unimplemented: %s", what);
2894     buf = code_string(ss.as_string());
2895   }
2896   stop(buf);
2897 }
2898 
2899 #ifdef _LP64
2900 #define XSTATE_BV 0x200
2901 #endif
2902 
2903 void MacroAssembler::pop_CPU_state() {
2904   pop_FPU_state();
2905   pop_IU_state();
2906 }
2907 
2908 void MacroAssembler::pop_FPU_state() {
2909 #ifndef _LP64
2910   frstor(Address(rsp, 0));
2911 #else
2912   fxrstor(Address(rsp, 0));
2913 #endif
2914   addptr(rsp, FPUStateSizeInWords * wordSize);
2915 }
2916 
2917 void MacroAssembler::pop_IU_state() {
2918   popa();
2919   LP64_ONLY(addq(rsp, 8));
2920   popf();
2921 }
2922 
2923 // Save Integer and Float state
2924 // Warning: Stack must be 16 byte aligned (64bit)
2925 void MacroAssembler::push_CPU_state() {
2926   push_IU_state();
2927   push_FPU_state();
2928 }
2929 
2930 void MacroAssembler::push_FPU_state() {
2931   subptr(rsp, FPUStateSizeInWords * wordSize);
2932 #ifndef _LP64
2933   fnsave(Address(rsp, 0));
2934   fwait();
2935 #else
2936   fxsave(Address(rsp, 0));
2937 #endif // LP64
2938 }
2939 
2940 void MacroAssembler::push_IU_state() {
2941   // Push flags first because pusha kills them
2942   pushf();
2943   // Make sure rsp stays 16-byte aligned
2944   LP64_ONLY(subq(rsp, 8));
2945   pusha();
2946 }
2947 
2948 void MacroAssembler::push_cont_fastpath() {
2949   if (!Continuations::enabled()) return;
2950 
2951 #ifndef _LP64
2952   Register rthread = rax;
2953   Register rrealsp = rbx;
2954   push(rthread);
2955   push(rrealsp);
2956 
2957   get_thread(rthread);
2958 
2959   // The code below wants the original RSP.
2960   // Move it back after the pushes above.
2961   movptr(rrealsp, rsp);
2962   addptr(rrealsp, 2*wordSize);
2963 #else
2964   Register rthread = r15_thread;
2965   Register rrealsp = rsp;
2966 #endif
2967 
2968   Label done;
2969   cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset()));
2970   jccb(Assembler::belowEqual, done);
2971   movptr(Address(rthread, JavaThread::cont_fastpath_offset()), rrealsp);
2972   bind(done);
2973 
2974 #ifndef _LP64
2975   pop(rrealsp);
2976   pop(rthread);
2977 #endif
2978 }
2979 
2980 void MacroAssembler::pop_cont_fastpath() {
2981   if (!Continuations::enabled()) return;
2982 
2983 #ifndef _LP64
2984   Register rthread = rax;
2985   Register rrealsp = rbx;
2986   push(rthread);
2987   push(rrealsp);
2988 
2989   get_thread(rthread);
2990 
2991   // The code below wants the original RSP.
2992   // Move it back after the pushes above.
2993   movptr(rrealsp, rsp);
2994   addptr(rrealsp, 2*wordSize);
2995 #else
2996   Register rthread = r15_thread;
2997   Register rrealsp = rsp;
2998 #endif
2999 
3000   Label done;
3001   cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset()));
3002   jccb(Assembler::below, done);
3003   movptr(Address(rthread, JavaThread::cont_fastpath_offset()), 0);
3004   bind(done);
3005 
3006 #ifndef _LP64
3007   pop(rrealsp);
3008   pop(rthread);
3009 #endif
3010 }
3011 
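// Adjust JavaThread::held_monitor_count. On 32-bit the current thread is
// first loaded into a scratch register; on 64-bit r15_thread is used directly.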
3012 void MacroAssembler::inc_held_monitor_count() {
3013 #ifndef _LP64
3014   Register thread = rax;
3015   push(thread);
3016   get_thread(thread);
3017   incrementl(Address(thread, JavaThread::held_monitor_count_offset()));
3018   pop(thread);
3019 #else // LP64
3020   incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
3021 #endif
3022 }
3023 
3024 void MacroAssembler::dec_held_monitor_count() {
3025 #ifndef _LP64
3026   Register thread = rax;
3027   push(thread);
3028   get_thread(thread);
3029   decrementl(Address(thread, JavaThread::held_monitor_count_offset()));
3030   pop(thread);
3031 #else // LP64
3032   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
3033 #endif
3034 }
3035 
3036 #ifdef ASSERT
3037 void MacroAssembler::stop_if_in_cont(Register cont, const char* name) {
3038 #ifdef _LP64
3039   Label no_cont;
3040   movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset()));
3041   testl(cont, cont);
3042   jcc(Assembler::zero, no_cont);
3043   stop(name);
3044   bind(no_cont);
3045 #else
3046   Unimplemented();
3047 #endif
3048 }
3049 #endif
3050 
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) {
  // determine java_thread register
3052   if (!java_thread->is_valid()) {
3053     java_thread = rdi;
3054     get_thread(java_thread);
3055   }
3056   // we must set sp to zero to clear frame
3057   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3058   // must clear fp, so that compiled frames are not confused; it is
3059   // possible that we need it only for debugging
3060   if (clear_fp) {
3061     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3062   }
3063   // Always clear the pc because it could have been set by make_walkable()
3064   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3065   vzeroupper();
3066 }
3067 
3068 void MacroAssembler::restore_rax(Register tmp) {
3069   if (tmp == noreg) pop(rax);
3070   else if (tmp != rax) mov(rax, tmp);
3071 }
3072 
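// Round 'reg' up to the next multiple of 'modulus' (must be a power of two).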
3073 void MacroAssembler::round_to(Register reg, int modulus) {
3074   addptr(reg, modulus - 1);
3075   andptr(reg, -modulus);
3076 }
3077 
3078 void MacroAssembler::save_rax(Register tmp) {
3079   if (tmp == noreg) push(rax);
3080   else if (tmp != rax) mov(tmp, rax);
3081 }
3082 
3083 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
3084   if (at_return) {
3085     // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
3086     // we may safely use rsp instead to perform the stack watermark check.
3087     cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset()));
3088     jcc(Assembler::above, slow_path);
3089     return;
3090   }
3091   testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
3092   jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
3093 }
3094 
3095 // Calls to C land
3096 //
3097 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded
3098 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3099 // has to be reset to 0. This is required to allow proper stack traversal.
3100 void MacroAssembler::set_last_Java_frame(Register java_thread,
3101                                          Register last_java_sp,
3102                                          Register last_java_fp,
3103                                          address  last_java_pc,
3104                                          Register rscratch) {
3105   vzeroupper();
3106   // determine java_thread register
3107   if (!java_thread->is_valid()) {
3108     java_thread = rdi;
3109     get_thread(java_thread);
3110   }
3111   // determine last_java_sp register
3112   if (!last_java_sp->is_valid()) {
3113     last_java_sp = rsp;
3114   }
3115   // last_java_fp is optional
3116   if (last_java_fp->is_valid()) {
3117     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3118   }
3119   // last_java_pc is optional
3120   if (last_java_pc != nullptr) {
3121     Address java_pc(java_thread,
3122                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
3123     lea(java_pc, InternalAddress(last_java_pc), rscratch);
3124   }
3125   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3126 }
3127 
3128 void MacroAssembler::shlptr(Register dst, int imm8) {
3129   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3130 }
3131 
3132 void MacroAssembler::shrptr(Register dst, int imm8) {
3133   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3134 }
3135 
3136 void MacroAssembler::sign_extend_byte(Register reg) {
3137   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3138     movsbl(reg, reg); // movsxb
3139   } else {
3140     shll(reg, 24);
3141     sarl(reg, 24);
3142   }
3143 }
3144 
3145 void MacroAssembler::sign_extend_short(Register reg) {
3146   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3147     movswl(reg, reg); // movsxw
3148   } else {
3149     shll(reg, 16);
3150     sarl(reg, 16);
3151   }
3152 }
3153 
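// testl with an immediate prefers the shorter testb encoding when the value
// fits in an unsigned byte (and, for the register form, a byte register is
// addressable).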
3154 void MacroAssembler::testl(Address dst, int32_t imm32) {
3155   if (imm32 >= 0 && is8bit(imm32)) {
3156     testb(dst, imm32);
3157   } else {
3158     Assembler::testl(dst, imm32);
3159   }
3160 }
3161 
3162 void MacroAssembler::testl(Register dst, int32_t imm32) {
3163   if (imm32 >= 0 && is8bit(imm32) && dst->has_byte_register()) {
3164     testb(dst, imm32);
3165   } else {
3166     Assembler::testl(dst, imm32);
3167   }
3168 }
3169 
3170 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3171   assert(always_reachable(src), "Address should be reachable");
3172   testl(dst, as_Address(src));
3173 }
3174 
3175 #ifdef _LP64
3176 
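// A non-negative imm32 sign-extends to a 64-bit mask whose upper half is zero,
// so only the low 32 bits of dst matter and the 32-bit test (without REX.W) suffices.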
3177 void MacroAssembler::testq(Address dst, int32_t imm32) {
3178   if (imm32 >= 0) {
3179     testl(dst, imm32);
3180   } else {
3181     Assembler::testq(dst, imm32);
3182   }
3183 }
3184 
3185 void MacroAssembler::testq(Register dst, int32_t imm32) {
3186   if (imm32 >= 0) {
3187     testl(dst, imm32);
3188   } else {
3189     Assembler::testq(dst, imm32);
3190   }
3191 }
3192 
3193 #endif
3194 
3195 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3196   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3197   Assembler::pcmpeqb(dst, src);
3198 }
3199 
3200 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3201   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3202   Assembler::pcmpeqw(dst, src);
3203 }
3204 
3205 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3206   assert((dst->encoding() < 16),"XMM register should be 0-15");
3207   Assembler::pcmpestri(dst, src, imm8);
3208 }
3209 
3210 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3211   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3212   Assembler::pcmpestri(dst, src, imm8);
3213 }
3214 
3215 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3216   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3217   Assembler::pmovzxbw(dst, src);
3218 }
3219 
3220 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3221   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3222   Assembler::pmovzxbw(dst, src);
3223 }
3224 
3225 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3226   assert((src->encoding() < 16),"XMM register should be 0-15");
3227   Assembler::pmovmskb(dst, src);
3228 }
3229 
3230 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3231   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3232   Assembler::ptest(dst, src);
3233 }
3234 
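// The AddressLiteral helpers below all follow the same pattern: if the literal's
// address can be encoded directly it is used as-is; otherwise the address is
// materialized into rscratch and an indirect operand is used.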
3235 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src, Register rscratch) {
3236   assert(rscratch != noreg || always_reachable(src), "missing");
3237 
3238   if (reachable(src)) {
3239     Assembler::sqrtss(dst, as_Address(src));
3240   } else {
3241     lea(rscratch, src);
3242     Assembler::sqrtss(dst, Address(rscratch, 0));
3243   }
3244 }
3245 
3246 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
3247   assert(rscratch != noreg || always_reachable(src), "missing");
3248 
3249   if (reachable(src)) {
3250     Assembler::subsd(dst, as_Address(src));
3251   } else {
3252     lea(rscratch, src);
3253     Assembler::subsd(dst, Address(rscratch, 0));
3254   }
3255 }
3256 
3257 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch) {
3258   assert(rscratch != noreg || always_reachable(src), "missing");
3259 
3260   if (reachable(src)) {
3261     Assembler::roundsd(dst, as_Address(src), rmode);
3262   } else {
3263     lea(rscratch, src);
3264     Assembler::roundsd(dst, Address(rscratch, 0), rmode);
3265   }
3266 }
3267 
3268 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src, Register rscratch) {
3269   assert(rscratch != noreg || always_reachable(src), "missing");
3270 
3271   if (reachable(src)) {
3272     Assembler::subss(dst, as_Address(src));
3273   } else {
3274     lea(rscratch, src);
3275     Assembler::subss(dst, Address(rscratch, 0));
3276   }
3277 }
3278 
3279 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
3280   assert(rscratch != noreg || always_reachable(src), "missing");
3281 
3282   if (reachable(src)) {
3283     Assembler::ucomisd(dst, as_Address(src));
3284   } else {
3285     lea(rscratch, src);
3286     Assembler::ucomisd(dst, Address(rscratch, 0));
3287   }
3288 }
3289 
3290 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
3291   assert(rscratch != noreg || always_reachable(src), "missing");
3292 
3293   if (reachable(src)) {
3294     Assembler::ucomiss(dst, as_Address(src));
3295   } else {
3296     lea(rscratch, src);
3297     Assembler::ucomiss(dst, Address(rscratch, 0));
3298   }
3299 }
3300 
3301 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
3302   assert(rscratch != noreg || always_reachable(src), "missing");
3303 
3304   // Used in sign-bit flipping with aligned address.
3305   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3306   if (reachable(src)) {
3307     Assembler::xorpd(dst, as_Address(src));
3308   } else {
3309     lea(rscratch, src);
3310     Assembler::xorpd(dst, Address(rscratch, 0));
3311   }
3312 }
3313 
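// EVEX-encoded xorpd requires AVX512DQ. On AVX-512 hardware without DQ, the
// register-clearing idiom (dst == src) is handled with the foundation-level
// integer vpxor at full width instead.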
3314 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3315   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3316     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3317   }
3318   else {
3319     Assembler::xorpd(dst, src);
3320   }
3321 }
3322 
3323 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3324   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3325     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3326   } else {
3327     Assembler::xorps(dst, src);
3328   }
3329 }
3330 
3331 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register rscratch) {
3332   assert(rscratch != noreg || always_reachable(src), "missing");
3333 
3334   // Used in sign-bit flipping with aligned address.
3335   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3336   if (reachable(src)) {
3337     Assembler::xorps(dst, as_Address(src));
3338   } else {
3339     lea(rscratch, src);
3340     Assembler::xorps(dst, Address(rscratch, 0));
3341   }
3342 }
3343 
3344 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src, Register rscratch) {
3345   assert(rscratch != noreg || always_reachable(src), "missing");
3346 
  // The legacy (SSE) encoding requires a 16-byte aligned memory operand.
3348   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3349   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3350   if (reachable(src)) {
3351     Assembler::pshufb(dst, as_Address(src));
3352   } else {
3353     lea(rscratch, src);
3354     Assembler::pshufb(dst, Address(rscratch, 0));
3355   }
3356 }
3357 
3358 // AVX 3-operands instructions
3359 
3360 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3361   assert(rscratch != noreg || always_reachable(src), "missing");
3362 
3363   if (reachable(src)) {
3364     vaddsd(dst, nds, as_Address(src));
3365   } else {
3366     lea(rscratch, src);
3367     vaddsd(dst, nds, Address(rscratch, 0));
3368   }
3369 }
3370 
3371 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3372   assert(rscratch != noreg || always_reachable(src), "missing");
3373 
3374   if (reachable(src)) {
3375     vaddss(dst, nds, as_Address(src));
3376   } else {
3377     lea(rscratch, src);
3378     vaddss(dst, nds, Address(rscratch, 0));
3379   }
3380 }
3381 
3382 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3383   assert(UseAVX > 0, "requires some form of AVX");
3384   assert(rscratch != noreg || always_reachable(src), "missing");
3385 
3386   if (reachable(src)) {
3387     Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
3388   } else {
3389     lea(rscratch, src);
3390     Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
3391   }
3392 }
3393 
3394 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3395   assert(UseAVX > 0, "requires some form of AVX");
3396   assert(rscratch != noreg || always_reachable(src), "missing");
3397 
3398   if (reachable(src)) {
3399     Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3400   } else {
3401     lea(rscratch, src);
3402     Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3403   }
3404 }
3405 
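// Absolute value is computed by AND-ing away the sign bit; negate_field is
// expected to point at the appropriate sign-mask constant.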
3406 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
3407   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3408   assert(rscratch != noreg || always_reachable(negate_field), "missing");
3409 
3410   vandps(dst, nds, negate_field, vector_len, rscratch);
3411 }
3412 
3413 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
3414   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3415   assert(rscratch != noreg || always_reachable(negate_field), "missing");
3416 
3417   vandpd(dst, nds, negate_field, vector_len, rscratch);
3418 }
3419 
3420 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3421   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3422   Assembler::vpaddb(dst, nds, src, vector_len);
3423 }
3424 
3425 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3426   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3427   Assembler::vpaddb(dst, nds, src, vector_len);
3428 }
3429 
3430 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3431   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3432   Assembler::vpaddw(dst, nds, src, vector_len);
3433 }
3434 
3435 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3436   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3437   Assembler::vpaddw(dst, nds, src, vector_len);
3438 }
3439 
3440 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3441   assert(rscratch != noreg || always_reachable(src), "missing");
3442 
3443   if (reachable(src)) {
3444     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3445   } else {
3446     lea(rscratch, src);
3447     Assembler::vpand(dst, nds, Address(rscratch, 0), vector_len);
3448   }
3449 }
3450 
3451 void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3452   assert(rscratch != noreg || always_reachable(src), "missing");
3453 
3454   if (reachable(src)) {
3455     Assembler::vpbroadcastd(dst, as_Address(src), vector_len);
3456   } else {
3457     lea(rscratch, src);
3458     Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len);
3459   }
3460 }
3461 
3462 void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3463   assert(rscratch != noreg || always_reachable(src), "missing");
3464 
3465   if (reachable(src)) {
3466     Assembler::vpbroadcastq(dst, as_Address(src), vector_len);
3467   } else {
3468     lea(rscratch, src);
3469     Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len);
3470   }
3471 }
3472 
3473 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3474   assert(rscratch != noreg || always_reachable(src), "missing");
3475 
3476   if (reachable(src)) {
3477     Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
3478   } else {
3479     lea(rscratch, src);
3480     Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
3481   }
3482 }
3483 
3484 void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3485   assert(rscratch != noreg || always_reachable(src), "missing");
3486 
3487   if (reachable(src)) {
3488     Assembler::vbroadcastss(dst, as_Address(src), vector_len);
3489   } else {
3490     lea(rscratch, src);
3491     Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len);
3492   }
3493 }
3494 
3495 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3496   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3497   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3498 }
3499 
3500 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3501   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3502   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3503 }
3504 
3505 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3506   assert(rscratch != noreg || always_reachable(src), "missing");
3507 
3508   if (reachable(src)) {
3509     Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3510   } else {
3511     lea(rscratch, src);
3512     Assembler::evpcmpeqd(kdst, mask, nds, Address(rscratch, 0), vector_len);
3513   }
3514 }
3515 
3516 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3517                              int comparison, bool is_signed, int vector_len, Register rscratch) {
3518   assert(rscratch != noreg || always_reachable(src), "missing");
3519 
3520   if (reachable(src)) {
3521     Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3522   } else {
3523     lea(rscratch, src);
3524     Assembler::evpcmpd(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3525   }
3526 }
3527 
3528 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3529                              int comparison, bool is_signed, int vector_len, Register rscratch) {
3530   assert(rscratch != noreg || always_reachable(src), "missing");
3531 
3532   if (reachable(src)) {
3533     Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3534   } else {
3535     lea(rscratch, src);
3536     Assembler::evpcmpq(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3537   }
3538 }
3539 
3540 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3541                              int comparison, bool is_signed, int vector_len, Register rscratch) {
3542   assert(rscratch != noreg || always_reachable(src), "missing");
3543 
3544   if (reachable(src)) {
3545     Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3546   } else {
3547     lea(rscratch, src);
3548     Assembler::evpcmpb(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3549   }
3550 }
3551 
3552 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3553                              int comparison, bool is_signed, int vector_len, Register rscratch) {
3554   assert(rscratch != noreg || always_reachable(src), "missing");
3555 
3556   if (reachable(src)) {
3557     Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3558   } else {
3559     lea(rscratch, src);
3560     Assembler::evpcmpw(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3561   }
3562 }
3563 
3564 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3565   if (width == Assembler::Q) {
3566     Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3567   } else {
3568     Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3569   }
3570 }
3571 
3572 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
3573   int eq_cond_enc = 0x29;
3574   int gt_cond_enc = 0x37;
3575   if (width != Assembler::Q) {
3576     eq_cond_enc = 0x74 + width;
3577     gt_cond_enc = 0x64 + width;
3578   }
3579   switch (cond) {
3580   case eq:
3581     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3582     break;
3583   case neq:
3584     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3585     vallones(xtmp, vector_len);
3586     vpxor(dst, xtmp, dst, vector_len);
3587     break;
3588   case le:
3589     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3590     vallones(xtmp, vector_len);
3591     vpxor(dst, xtmp, dst, vector_len);
3592     break;
3593   case nlt:
3594     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3595     vallones(xtmp, vector_len);
3596     vpxor(dst, xtmp, dst, vector_len);
3597     break;
3598   case lt:
3599     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3600     break;
3601   case nle:
3602     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3603     break;
3604   default:
3605     assert(false, "Should not reach here");
3606   }
3607 }
3608 
3609 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3610   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3611   Assembler::vpmovzxbw(dst, src, vector_len);
3612 }
3613 
3614 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3615   assert((src->encoding() < 16),"XMM register should be 0-15");
3616   Assembler::vpmovmskb(dst, src, vector_len);
3617 }
3618 
3619 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3620   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3621   Assembler::vpmullw(dst, nds, src, vector_len);
3622 }
3623 
3624 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3625   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3626   Assembler::vpmullw(dst, nds, src, vector_len);
3627 }
3628 
3629 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3630   assert((UseAVX > 0), "AVX support is needed");
3631   assert(rscratch != noreg || always_reachable(src), "missing");
3632 
3633   if (reachable(src)) {
3634     Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3635   } else {
3636     lea(rscratch, src);
3637     Assembler::vpmulld(dst, nds, Address(rscratch, 0), vector_len);
3638   }
3639 }
3640 
3641 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3642   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3643   Assembler::vpsubb(dst, nds, src, vector_len);
3644 }
3645 
3646 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3647   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3648   Assembler::vpsubb(dst, nds, src, vector_len);
3649 }
3650 
3651 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3652   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3653   Assembler::vpsubw(dst, nds, src, vector_len);
3654 }
3655 
3656 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3657   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3658   Assembler::vpsubw(dst, nds, src, vector_len);
3659 }
3660 
3661 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3662   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3663   Assembler::vpsraw(dst, nds, shift, vector_len);
3664 }
3665 
3666 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3667   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3668   Assembler::vpsraw(dst, nds, shift, vector_len);
3669 }
3670 
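// Without AVX512VL only the 512-bit EVEX form is available, so narrower
// requests are executed at full vector width.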
3671 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3672   assert(UseAVX > 2,"");
3673   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3674      vector_len = 2;
3675   }
3676   Assembler::evpsraq(dst, nds, shift, vector_len);
3677 }
3678 
3679 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3680   assert(UseAVX > 2,"");
3681   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3682      vector_len = 2;
3683   }
3684   Assembler::evpsraq(dst, nds, shift, vector_len);
3685 }
3686 
3687 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3688   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3689   Assembler::vpsrlw(dst, nds, shift, vector_len);
3690 }
3691 
3692 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3693   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3694   Assembler::vpsrlw(dst, nds, shift, vector_len);
3695 }
3696 
3697 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3698   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3699   Assembler::vpsllw(dst, nds, shift, vector_len);
3700 }
3701 
3702 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3703   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3704   Assembler::vpsllw(dst, nds, shift, vector_len);
3705 }
3706 
3707 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3708   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3709   Assembler::vptest(dst, src);
3710 }
3711 
3712 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3713   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3714   Assembler::punpcklbw(dst, src);
3715 }
3716 
3717 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3718   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3719   Assembler::pshufd(dst, src, mode);
3720 }
3721 
3722 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3723   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3724   Assembler::pshuflw(dst, src, mode);
3725 }
3726 
3727 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3728   assert(rscratch != noreg || always_reachable(src), "missing");
3729 
3730   if (reachable(src)) {
3731     vandpd(dst, nds, as_Address(src), vector_len);
3732   } else {
3733     lea(rscratch, src);
3734     vandpd(dst, nds, Address(rscratch, 0), vector_len);
3735   }
3736 }
3737 
3738 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3739   assert(rscratch != noreg || always_reachable(src), "missing");
3740 
3741   if (reachable(src)) {
3742     vandps(dst, nds, as_Address(src), vector_len);
3743   } else {
3744     lea(rscratch, src);
3745     vandps(dst, nds, Address(rscratch, 0), vector_len);
3746   }
3747 }
3748 
3749 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3750                             bool merge, int vector_len, Register rscratch) {
3751   assert(rscratch != noreg || always_reachable(src), "missing");
3752 
3753   if (reachable(src)) {
3754     Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3755   } else {
3756     lea(rscratch, src);
3757     Assembler::evpord(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
3758   }
3759 }
3760 
3761 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3762   assert(rscratch != noreg || always_reachable(src), "missing");
3763 
3764   if (reachable(src)) {
3765     vdivsd(dst, nds, as_Address(src));
3766   } else {
3767     lea(rscratch, src);
3768     vdivsd(dst, nds, Address(rscratch, 0));
3769   }
3770 }
3771 
3772 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3773   assert(rscratch != noreg || always_reachable(src), "missing");
3774 
3775   if (reachable(src)) {
3776     vdivss(dst, nds, as_Address(src));
3777   } else {
3778     lea(rscratch, src);
3779     vdivss(dst, nds, Address(rscratch, 0));
3780   }
3781 }
3782 
3783 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3784   assert(rscratch != noreg || always_reachable(src), "missing");
3785 
3786   if (reachable(src)) {
3787     vmulsd(dst, nds, as_Address(src));
3788   } else {
3789     lea(rscratch, src);
3790     vmulsd(dst, nds, Address(rscratch, 0));
3791   }
3792 }
3793 
3794 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3795   assert(rscratch != noreg || always_reachable(src), "missing");
3796 
3797   if (reachable(src)) {
3798     vmulss(dst, nds, as_Address(src));
3799   } else {
3800     lea(rscratch, src);
3801     vmulss(dst, nds, Address(rscratch, 0));
3802   }
3803 }
3804 
3805 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3806   assert(rscratch != noreg || always_reachable(src), "missing");
3807 
3808   if (reachable(src)) {
3809     vsubsd(dst, nds, as_Address(src));
3810   } else {
3811     lea(rscratch, src);
3812     vsubsd(dst, nds, Address(rscratch, 0));
3813   }
3814 }
3815 
3816 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3817   assert(rscratch != noreg || always_reachable(src), "missing");
3818 
3819   if (reachable(src)) {
3820     vsubss(dst, nds, as_Address(src));
3821   } else {
3822     lea(rscratch, src);
3823     vsubss(dst, nds, Address(rscratch, 0));
3824   }
3825 }
3826 
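// Negation is computed by XOR-ing the sign bit; src is expected to point at the
// appropriate sign-bit constant.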
3827 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3828   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3829   assert(rscratch != noreg || always_reachable(src), "missing");
3830 
3831   vxorps(dst, nds, src, Assembler::AVX_128bit, rscratch);
3832 }
3833 
3834 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3835   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3836   assert(rscratch != noreg || always_reachable(src), "missing");
3837 
3838   vxorpd(dst, nds, src, Assembler::AVX_128bit, rscratch);
3839 }
3840 
3841 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3842   assert(rscratch != noreg || always_reachable(src), "missing");
3843 
3844   if (reachable(src)) {
3845     vxorpd(dst, nds, as_Address(src), vector_len);
3846   } else {
3847     lea(rscratch, src);
3848     vxorpd(dst, nds, Address(rscratch, 0), vector_len);
3849   }
3850 }
3851 
3852 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3853   assert(rscratch != noreg || always_reachable(src), "missing");
3854 
3855   if (reachable(src)) {
3856     vxorps(dst, nds, as_Address(src), vector_len);
3857   } else {
3858     lea(rscratch, src);
3859     vxorps(dst, nds, Address(rscratch, 0), vector_len);
3860   }
3861 }
3862 
3863 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3864   assert(rscratch != noreg || always_reachable(src), "missing");
3865 
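  // 256-bit integer vpxor needs AVX2; with only AVX1 the wider request is
  // handled with vxorpd instead.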
3866   if (UseAVX > 1 || (vector_len < 1)) {
3867     if (reachable(src)) {
3868       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3869     } else {
3870       lea(rscratch, src);
3871       Assembler::vpxor(dst, nds, Address(rscratch, 0), vector_len);
3872     }
3873   } else {
3874     MacroAssembler::vxorpd(dst, nds, src, vector_len, rscratch);
3875   }
3876 }
3877 
3878 void MacroAssembler::vpermd(XMMRegister dst,  XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3879   assert(rscratch != noreg || always_reachable(src), "missing");
3880 
3881   if (reachable(src)) {
3882     Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3883   } else {
3884     lea(rscratch, src);
3885     Assembler::vpermd(dst, nds, Address(rscratch, 0), vector_len);
3886   }
3887 }
3888 
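// JNI handles carry a small type tag in their low bits (JNIHandles::tag_mask);
// clearing those bits recovers the untagged handle address. The static assert
// below pins down the mask value this code was written for.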
3889 void MacroAssembler::clear_jobject_tag(Register possibly_non_local) {
3890   const int32_t inverted_mask = ~static_cast<int32_t>(JNIHandles::tag_mask);
3891   STATIC_ASSERT(inverted_mask == -4); // otherwise check this code
3892   // The inverted mask is sign-extended
3893   andptr(possibly_non_local, inverted_mask);
3894 }
3895 
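// A jobject is either a local handle (untagged), a global handle, or a weak
// global handle; the tag determines which offset and load strength are used to
// resolve it to an oop.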
3896 void MacroAssembler::resolve_jobject(Register value,
3897                                      Register thread,
3898                                      Register tmp) {
3899   assert_different_registers(value, thread, tmp);
3900   Label done, tagged, weak_tagged;
3901   testptr(value, value);
3902   jcc(Assembler::zero, done);           // Use null as-is.
3903   testptr(value, JNIHandles::tag_mask); // Test for tag.
3904   jcc(Assembler::notZero, tagged);
3905 
3906   // Resolve local handle
3907   access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp, thread);
3908   verify_oop(value);
3909   jmp(done);
3910 
3911   bind(tagged);
3912   testptr(value, JNIHandles::TypeTag::weak_global); // Test for weak tag.
3913   jcc(Assembler::notZero, weak_tagged);
3914 
3915   // Resolve global handle
3916   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp, thread);
3917   verify_oop(value);
3918   jmp(done);
3919 
3920   bind(weak_tagged);
3921   // Resolve jweak.
3922   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3923                  value, Address(value, -JNIHandles::TypeTag::weak_global), tmp, thread);
3924   verify_oop(value);
3925 
3926   bind(done);
3927 }
3928 
3929 void MacroAssembler::resolve_global_jobject(Register value,
3930                                             Register thread,
3931                                             Register tmp) {
3932   assert_different_registers(value, thread, tmp);
3933   Label done;
3934 
3935   testptr(value, value);
3936   jcc(Assembler::zero, done);           // Use null as-is.
3937 
3938 #ifdef ASSERT
3939   {
3940     Label valid_global_tag;
3941     testptr(value, JNIHandles::TypeTag::global); // Test for global tag.
3942     jcc(Assembler::notZero, valid_global_tag);
3943     stop("non global jobject using resolve_global_jobject");
3944     bind(valid_global_tag);
3945   }
3946 #endif
3947 
3948   // Resolve global handle
3949   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp, thread);
3950   verify_oop(value);
3951 
3952   bind(done);
3953 }
3954 
3955 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3956   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
3957 }
3958 
// Force generation of a 4-byte immediate value even if it fits into 8 bits
3960 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3961   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
3962 }
3963 
3964 void MacroAssembler::subptr(Register dst, Register src) {
3965   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
3966 }
3967 
3968 // C++ bool manipulation
void MacroAssembler::testbool(Register dst) {
  if (sizeof(bool) == 1) {
    testb(dst, 0xff);
  } else if (sizeof(bool) == 2) {
    // testw implementation needed for two byte bools
    ShouldNotReachHere();
  } else if (sizeof(bool) == 4) {
    testl(dst, dst);
  } else {
    // unsupported
    ShouldNotReachHere();
  }
}
3981 
3982 void MacroAssembler::testptr(Register dst, Register src) {
3983   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
3984 }
3985 
3986 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3987 void MacroAssembler::tlab_allocate(Register thread, Register obj,
3988                                    Register var_size_in_bytes,
3989                                    int con_size_in_bytes,
3990                                    Register t1,
3991                                    Register t2,
3992                                    Label& slow_case) {
3993   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3994   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3995 }
3996 
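// Caller-saved (volatile) general-purpose registers for the current ABI.
// Note that on Win64 rsi and rdi are callee-saved, so they are excluded there.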
3997 RegSet MacroAssembler::call_clobbered_gp_registers() {
3998   RegSet regs;
3999 #ifdef _LP64
4000   regs += RegSet::of(rax, rcx, rdx);
4001 #ifndef WINDOWS
4002   regs += RegSet::of(rsi, rdi);
4003 #endif
4004   regs += RegSet::range(r8, r11);
4005 #else
4006   regs += RegSet::of(rax, rcx, rdx);
4007 #endif
4008   return regs;
4009 }
4010 
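// Caller-saved XMM registers. The Win64 ABI treats xmm6-xmm15 as callee-saved
// (only xmm0-xmm5 are volatile); the EVEX registers xmm16 and up, when present,
// are caller-saved on all platforms.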
4011 XMMRegSet MacroAssembler::call_clobbered_xmm_registers() {
4012   int num_xmm_registers = XMMRegister::available_xmm_registers();
4013 #if defined(WINDOWS) && defined(_LP64)
4014   XMMRegSet result = XMMRegSet::range(xmm0, xmm5);
4015   if (num_xmm_registers > 16) {
4016      result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1));
4017   }
4018   return result;
4019 #else
4020   return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1));
4021 #endif
4022 }
4023 
4024 static int FPUSaveAreaSize = align_up(108, StackAlignmentInBytes); // 108 bytes needed for FPU state by fsave/frstor
4025 
4026 #ifndef _LP64
4027 static bool use_x87_registers() { return UseSSE < 2; }
4028 #endif
4029 static bool use_xmm_registers() { return UseSSE >= 1; }
4030 
4031 // C1 only ever uses the first double/float of the XMM register.
4032 static int xmm_save_size() { return UseSSE >= 2 ? sizeof(double) : sizeof(float); }
4033 
4034 static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
4035   if (UseSSE == 1) {
4036     masm->movflt(Address(rsp, offset), reg);
4037   } else {
4038     masm->movdbl(Address(rsp, offset), reg);
4039   }
4040 }
4041 
4042 static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
4043   if (UseSSE == 1) {
4044     masm->movflt(reg, Address(rsp, offset));
4045   } else {
4046     masm->movdbl(reg, Address(rsp, offset));
4047   }
4048 }
4049 
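// Computes the section sizes of the register save area used by
// push/pop_call_clobbered_registers. Layout, from the saved rsp upwards:
//   [GP spill area][x87 FPU save area (32-bit, UseSSE < 2 only)][XMM spill area]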
4050 int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers, bool save_fpu,
4051                            int& gp_area_size, int& fp_area_size, int& xmm_area_size) {
4052 
4053   gp_area_size = align_up(gp_registers.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size,
4054                          StackAlignmentInBytes);
4055 #ifdef _LP64
4056   fp_area_size = 0;
4057 #else
4058   fp_area_size = (save_fpu && use_x87_registers()) ? FPUSaveAreaSize : 0;
4059 #endif
4060   xmm_area_size = (save_fpu && use_xmm_registers()) ? xmm_registers.size() * xmm_save_size() : 0;
4061 
4062   return gp_area_size + fp_area_size + xmm_area_size;
4063 }
4064 
4065 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) {
4066   block_comment("push_call_clobbered_registers start");
4067   // Regular registers
4068   RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude;
4069 
4070   int gp_area_size;
4071   int fp_area_size;
4072   int xmm_area_size;
4073   int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu,
4074                                                gp_area_size, fp_area_size, xmm_area_size);
4075   subptr(rsp, total_save_size);
4076 
4077   push_set(gp_registers_to_push, 0);
4078 
4079 #ifndef _LP64
4080   if (save_fpu && use_x87_registers()) {
4081     fnsave(Address(rsp, gp_area_size));
4082     fwait();
4083   }
4084 #endif
4085   if (save_fpu && use_xmm_registers()) {
4086     push_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size);
4087   }
4088 
4089   block_comment("push_call_clobbered_registers end");
4090 }
4091 
4092 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) {
4093   block_comment("pop_call_clobbered_registers start");
4094 
4095   RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude;
4096 
4097   int gp_area_size;
4098   int fp_area_size;
4099   int xmm_area_size;
4100   int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu,
4101                                                gp_area_size, fp_area_size, xmm_area_size);
4102 
4103   if (restore_fpu && use_xmm_registers()) {
4104     pop_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size);
4105   }
4106 #ifndef _LP64
4107   if (restore_fpu && use_x87_registers()) {
4108     frstor(Address(rsp, gp_area_size));
4109   }
4110 #endif
4111 
4112   pop_set(gp_registers_to_pop, 0);
4113 
4114   addptr(rsp, total_save_size);
4115 
4116   vzeroupper();
4117 
4118   block_comment("pop_call_clobbered_registers end");
4119 }
4120 
4121 void MacroAssembler::push_set(XMMRegSet set, int offset) {
4122   assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be");
4123   int spill_offset = offset;
4124 
4125   for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) {
4126     save_xmm_register(this, spill_offset, *it);
4127     spill_offset += xmm_save_size();
4128   }
4129 }
4130 
4131 void MacroAssembler::pop_set(XMMRegSet set, int offset) {
4132   int restore_size = set.size() * xmm_save_size();
4133   assert(is_aligned(restore_size, StackAlignmentInBytes), "must be");
4134 
4135   int restore_offset = offset + restore_size - xmm_save_size();
4136 
4137   for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) {
4138     restore_xmm_register(this, restore_offset, *it);
4139     restore_offset -= xmm_save_size();
4140   }
4141 }
4142 
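// Spill a set of GP registers. offset == -1 means "allocate aligned stack space
// here and spill starting at rsp"; any other offset spills into space the caller
// has already reserved. pop_set() mirrors this and only releases stack space in
// the offset == -1 case.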
4143 void MacroAssembler::push_set(RegSet set, int offset) {
4144   int spill_offset;
4145   if (offset == -1) {
4146     int register_push_size = set.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
4147     int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
4148     subptr(rsp, aligned_size);
4149     spill_offset = 0;
4150   } else {
4151     spill_offset = offset;
4152   }
4153 
4154   for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
4155     movptr(Address(rsp, spill_offset), *it);
4156     spill_offset += Register::max_slots_per_register * VMRegImpl::stack_slot_size;
4157   }
4158 }
4159 
4160 void MacroAssembler::pop_set(RegSet set, int offset) {
4161 
4162   int gp_reg_size = Register::max_slots_per_register * VMRegImpl::stack_slot_size;
4163   int restore_size = set.size() * gp_reg_size;
4164   int aligned_size = align_up(restore_size, StackAlignmentInBytes);
4165 
4166   int restore_offset;
4167   if (offset == -1) {
4168     restore_offset = restore_size - gp_reg_size;
4169   } else {
4170     restore_offset = offset + restore_size - gp_reg_size;
4171   }
4172   for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) {
4173     movptr(*it, Address(rsp, restore_offset));
4174     restore_offset -= gp_reg_size;
4175   }
4176 
4177   if (offset == -1) {
4178     addptr(rsp, aligned_size);
4179   }
4180 }
4181 
// Preserves the contents of address, destroys the contents of length_in_bytes and temp.
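// Roughly equivalent to the following illustrative C sketch (64-bit case):
//   for (size_t i = length_in_bytes / BytesPerWord; i > 0; i--) {
//     ((intptr_t*)(address + offset_in_bytes))[i - 1] = 0;
//   }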
4183 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
4184   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
4185   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
4186   Label done;
4187 
4188   testptr(length_in_bytes, length_in_bytes);
4189   jcc(Assembler::zero, done);
4190 
4191   // initialize topmost word, divide index by 2, check if odd and test if zero
4192   // note: for the remaining code to work, index must be a multiple of BytesPerWord
4193 #ifdef ASSERT
4194   {
4195     Label L;
4196     testptr(length_in_bytes, BytesPerWord - 1);
4197     jcc(Assembler::zero, L);
4198     stop("length must be a multiple of BytesPerWord");
4199     bind(L);
4200   }
4201 #endif
4202   Register index = length_in_bytes;
4203   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
4204   if (UseIncDec) {
4205     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
4206   } else {
4207     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
4208     shrptr(index, 1);
4209   }
4210 #ifndef _LP64
4211   // index could have not been a multiple of 8 (i.e., bit 2 was set)
4212   {
4213     Label even;
4214     // note: if index was a multiple of 8, then it cannot
4215     //       be 0 now otherwise it must have been 0 before
4216     //       => if it is even, we don't need to check for 0 again
4217     jcc(Assembler::carryClear, even);
4218     // clear topmost word (no jump would be needed if conditional assignment worked here)
4219     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
4220     // index could be 0 now, must check again
4221     jcc(Assembler::zero, done);
4222     bind(even);
4223   }
4224 #endif // !_LP64
4225   // initialize remaining object fields: index is a multiple of 2 now
4226   {
4227     Label loop;
4228     bind(loop);
4229     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
4230     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
4231     decrement(index);
4232     jcc(Assembler::notZero, loop);
4233   }
4234 
4235   bind(done);
4236 }
4237 
4238 // Look up the method for a megamorphic invokeinterface call.
4239 // The target method is determined by <intf_klass, itable_index>.
4240 // The receiver klass is in recv_klass.
4241 // On success, the result will be in method_result, and execution falls through.
4242 // On failure, execution transfers to the given label.
4243 void MacroAssembler::lookup_interface_method(Register recv_klass,
4244                                              Register intf_klass,
4245                                              RegisterOrConstant itable_index,
4246                                              Register method_result,
4247                                              Register scan_temp,
4248                                              Label& L_no_such_interface,
4249                                              bool return_method) {
4250   assert_different_registers(recv_klass, intf_klass, scan_temp);
4251   assert_different_registers(method_result, intf_klass, scan_temp);
4252   assert(recv_klass != method_result || !return_method,
4253          "recv_klass can be destroyed when method isn't needed");
4254 
4255   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4256          "caller must use same register for non-constant itable index as for method");
4257 
4258   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
4259   int vtable_base = in_bytes(Klass::vtable_start_offset());
4260   int itentry_off = in_bytes(itableMethodEntry::method_offset());
4261   int scan_step   = itableOffsetEntry::size() * wordSize;
4262   int vte_size    = vtableEntry::size_in_bytes();
4263   Address::ScaleFactor times_vte_scale = Address::times_ptr;
4264   assert(vte_size == wordSize, "else adjust times_vte_scale");
4265 
4266   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4267 
4268   // %%% Could store the aligned, prescaled offset in the klassoop.
4269   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
4270 
4271   if (return_method) {
4272     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4273     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4274     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
4275   }
4276 
4277   // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
4278   //   if (scan->interface() == intf) {
4279   //     result = (klass + scan->offset() + itable_index);
4280   //   }
4281   // }
4282   Label search, found_method;
4283 
4284   for (int peel = 1; peel >= 0; peel--) {
4285     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset()));
4286     cmpptr(intf_klass, method_result);
4287 
4288     if (peel) {
4289       jccb(Assembler::equal, found_method);
4290     } else {
4291       jccb(Assembler::notEqual, search);
4292       // (invert the test to fall through to found_method...)
4293     }
4294 
4295     if (!peel)  break;
4296 
4297     bind(search);
4298 
4299     // Check that the previous entry is non-null.  A null entry means that
4300     // the receiver class doesn't implement the interface, and wasn't the
4301     // same as when the caller was compiled.
4302     testptr(method_result, method_result);
4303     jcc(Assembler::zero, L_no_such_interface);
4304     addptr(scan_temp, scan_step);
4305   }
4306 
4307   bind(found_method);
4308 
4309   if (return_method) {
4310     // Got a hit.
4311     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset()));
4312     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
4313   }
4314 }
4315 
4316 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
4317 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICHolder
4318 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
4319 // The target method is determined by <holder_klass, itable_index>.
4320 // The receiver klass is in recv_klass.
4321 // On success, the result will be in method_result, and execution falls through.
4322 // On failure, execution transfers to the given label.
4323 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
4324                                                   Register holder_klass,
4325                                                   Register resolved_klass,
4326                                                   Register method_result,
4327                                                   Register scan_temp,
4328                                                   Register temp_reg2,
4329                                                   Register receiver,
4330                                                   int itable_index,
4331                                                   Label& L_no_such_interface) {
4332   assert_different_registers(recv_klass, method_result, holder_klass, resolved_klass, scan_temp, temp_reg2, receiver);
4333   Register temp_itbl_klass = method_result;
4334   Register temp_reg = (temp_reg2 == noreg ? recv_klass : temp_reg2); // reuse recv_klass register on 32-bit x86 impl
4335 
4336   int vtable_base = in_bytes(Klass::vtable_start_offset());
4337   int itentry_off = in_bytes(itableMethodEntry::method_offset());
4338   int scan_step = itableOffsetEntry::size() * wordSize;
4339   int vte_size = vtableEntry::size_in_bytes();
4340   int ioffset = in_bytes(itableOffsetEntry::interface_offset());
4341   int ooffset = in_bytes(itableOffsetEntry::offset_offset());
4342   Address::ScaleFactor times_vte_scale = Address::times_ptr;
4343   assert(vte_size == wordSize, "adjust times_vte_scale");
4344 
4345   Label L_loop_scan_resolved_entry, L_resolved_found, L_holder_found;
4346 
4347   // temp_itbl_klass = recv_klass.itable[0]
4348   // scan_temp = &recv_klass.itable[0] + step
4349   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4350   movptr(temp_itbl_klass, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset));
4351   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset + scan_step));
4352   xorptr(temp_reg, temp_reg);
4353 
4354   // Initial checks:
4355   //   - if (holder_klass != resolved_klass), go to "scan for resolved"
4356   //   - if (itable[0] == 0), no such interface
4357   //   - if (itable[0] == holder_klass), shortcut to "holder found"
4358   cmpptr(holder_klass, resolved_klass);
4359   jccb(Assembler::notEqual, L_loop_scan_resolved_entry);
4360   testptr(temp_itbl_klass, temp_itbl_klass);
4361   jccb(Assembler::zero, L_no_such_interface);
4362   cmpptr(holder_klass, temp_itbl_klass);
4363   jccb(Assembler::equal, L_holder_found);
4364 
4365   // Loop: Look for holder_klass record in itable
4366   //   do {
4367   //     tmp = itable[index];
4368   //     index += step;
4369   //     if (tmp == holder_klass) {
4370   //       goto L_holder_found; // Found!
4371   //     }
4372   //   } while (tmp != 0);
4373   //   goto L_no_such_interface // Not found.
4374   Label L_scan_holder;
4375   bind(L_scan_holder);
4376     movptr(temp_itbl_klass, Address(scan_temp, 0));
4377     addptr(scan_temp, scan_step);
4378     cmpptr(holder_klass, temp_itbl_klass);
4379     jccb(Assembler::equal, L_holder_found);
4380     testptr(temp_itbl_klass, temp_itbl_klass);
4381     jccb(Assembler::notZero, L_scan_holder);
4382 
4383   jmpb(L_no_such_interface);
4384 
4385   // Loop: Look for resolved_class record in itable
4386   //   do {
4387   //     tmp = itable[index];
4388   //     index += step;
4389   //     if (tmp == holder_klass) {
4390   //        // Also check if we have met a holder klass
4391   //        holder_tmp = itable[index-step-ioffset];
4392   //     }
4393   //     if (tmp == resolved_klass) {
4394   //        goto L_resolved_found;  // Found!
4395   //     }
4396   //   } while (tmp != 0);
4397   //   goto L_no_such_interface // Not found.
4398   //
4399   Label L_loop_scan_resolved;
4400   bind(L_loop_scan_resolved);
4401     movptr(temp_itbl_klass, Address(scan_temp, 0));
4402     addptr(scan_temp, scan_step);
4403     bind(L_loop_scan_resolved_entry);
4404     cmpptr(holder_klass, temp_itbl_klass);
4405     cmovl(Assembler::equal, temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
4406     cmpptr(resolved_klass, temp_itbl_klass);
4407     jccb(Assembler::equal, L_resolved_found);
4408     testptr(temp_itbl_klass, temp_itbl_klass);
4409     jccb(Assembler::notZero, L_loop_scan_resolved);
4410 
4411   jmpb(L_no_such_interface);
4412 
4413   Label L_ready;
4414 
4415   // See if we already have a holder klass. If not, go and scan for it.
4416   bind(L_resolved_found);
4417   testptr(temp_reg, temp_reg);
4418   jccb(Assembler::zero, L_scan_holder);
4419   jmpb(L_ready);
4420 
4421   bind(L_holder_found);
4422   movl(temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
4423 
4424   // Finally, temp_reg contains holder_klass vtable offset
4425   bind(L_ready);
4426   assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4427   if (temp_reg2 == noreg) { // recv_klass register is clobbered for 32-bit x86 impl
4428     load_klass(scan_temp, receiver, noreg);
4429     movptr(method_result, Address(scan_temp, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
4430   } else {
4431     movptr(method_result, Address(recv_klass, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
4432   }
4433 }
4434 
4435 
4436 // virtual method calling
4437 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4438                                            RegisterOrConstant vtable_index,
4439                                            Register method_result) {
4440   const ByteSize base = Klass::vtable_start_offset();
4441   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4442   Address vtable_entry_addr(recv_klass,
4443                             vtable_index, Address::times_ptr,
4444                             base + vtableEntry::method_offset());
4445   movptr(method_result, vtable_entry_addr);
4446 }
4447 
4448 
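// Convenience form: branches to L_success if sub_klass is a subtype of
// super_klass (trying the fast path and then the slow path); otherwise falls
// through.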
4449 void MacroAssembler::check_klass_subtype(Register sub_klass,
4450                            Register super_klass,
4451                            Register temp_reg,
4452                            Label& L_success) {
4453   Label L_failure;
4454   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, nullptr);
4455   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, nullptr);
4456   bind(L_failure);
4457 }
4458 
4459 
4460 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4461                                                    Register super_klass,
4462                                                    Register temp_reg,
4463                                                    Label* L_success,
4464                                                    Label* L_failure,
4465                                                    Label* L_slow_path,
4466                                         RegisterOrConstant super_check_offset) {
4467   assert_different_registers(sub_klass, super_klass, temp_reg);
4468   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
4469   if (super_check_offset.is_register()) {
4470     assert_different_registers(sub_klass, super_klass,
4471                                super_check_offset.as_register());
4472   } else if (must_load_sco) {
4473     assert(temp_reg != noreg, "supply either a temp or a register offset");
4474   }
4475 
4476   Label L_fallthrough;
4477   int label_nulls = 0;
4478   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4479   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4480   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
4481   assert(label_nulls <= 1, "at most one null in the batch");
4482 
4483   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4484   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4485   Address super_check_offset_addr(super_klass, sco_offset);
4486 
4487   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
4488   // range of a jccb.  If this routine grows larger, reconsider at
4489   // least some of these.
4490 #define local_jcc(assembler_cond, label)                                \
4491   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
4492   else                             jcc( assembler_cond, label) /*omit semi*/
4493 
4494   // Hacked jmp, which may only be used just before L_fallthrough.
4495 #define final_jmp(label)                                                \
4496   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4497   else                            jmp(label)                /*omit semi*/
4498 
4499   // If the pointers are equal, we are done (e.g., String[] elements).
4500   // This self-check enables sharing of secondary supertype arrays among
4501   // non-primary types such as array-of-interface.  Otherwise, each such
4502   // type would need its own customized SSA.
4503   // We move this check to the front of the fast path because many
4504   // type checks are in fact trivially successful in this manner,
4505   // so we get a nicely predicted branch right at the start of the check.
4506   cmpptr(sub_klass, super_klass);
4507   local_jcc(Assembler::equal, *L_success);
4508 
4509   // Check the supertype display:
4510   if (must_load_sco) {
4511     // Positive movl does right thing on LP64.
4512     movl(temp_reg, super_check_offset_addr);
4513     super_check_offset = RegisterOrConstant(temp_reg);
4514   }
4515   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4516   cmpptr(super_klass, super_check_addr); // load displayed supertype
4517 
4518   // This check has worked decisively for primary supers.
4519   // Secondary supers are sought in the super_cache ('super_cache_addr').
4520   // (Secondary supers are interfaces and very deeply nested subtypes.)
4521   // This works in the same check above because of a tricky aliasing
4522   // between the super_cache and the primary super display elements.
4523   // (The 'super_check_addr' can address either, as the case requires.)
4524   // Note that the cache is updated below if it does not help us find
4525   // what we need immediately.
4526   // So if it was a primary super, we can just fail immediately.
4527   // Otherwise, it's the slow path for us (no success at this point).
4528 
4529   if (super_check_offset.is_register()) {
4530     local_jcc(Assembler::equal, *L_success);
4531     cmpl(super_check_offset.as_register(), sc_offset);
4532     if (L_failure == &L_fallthrough) {
4533       local_jcc(Assembler::equal, *L_slow_path);
4534     } else {
4535       local_jcc(Assembler::notEqual, *L_failure);
4536       final_jmp(*L_slow_path);
4537     }
4538   } else if (super_check_offset.as_constant() == sc_offset) {
4539     // Need a slow path; fast failure is impossible.
4540     if (L_slow_path == &L_fallthrough) {
4541       local_jcc(Assembler::equal, *L_success);
4542     } else {
4543       local_jcc(Assembler::notEqual, *L_slow_path);
4544       final_jmp(*L_success);
4545     }
4546   } else {
4547     // No slow path; it's a fast decision.
4548     if (L_failure == &L_fallthrough) {
4549       local_jcc(Assembler::equal, *L_success);
4550     } else {
4551       local_jcc(Assembler::notEqual, *L_failure);
4552       final_jmp(*L_success);
4553     }
4554   }
4555 
4556   bind(L_fallthrough);
4557 
4558 #undef local_jcc
4559 #undef final_jmp
4560 }
4561 
4562 
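// Slow-path subtype check: does a linear scan of sub_klass's secondary
// supers array (repne scas over Klass* entries) and, on a hit, records
// super_klass in the secondary super cache so the fast path succeeds next
// time.  Label conventions match the fast path above.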
4563 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4564                                                    Register super_klass,
4565                                                    Register temp_reg,
4566                                                    Register temp2_reg,
4567                                                    Label* L_success,
4568                                                    Label* L_failure,
4569                                                    bool set_cond_codes) {
4570   assert_different_registers(sub_klass, super_klass, temp_reg);
4571   if (temp2_reg != noreg)
4572     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4573 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4574 
4575   Label L_fallthrough;
4576   int label_nulls = 0;
4577   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4578   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4579   assert(label_nulls <= 1, "at most one null in the batch");
4580 
4581   // a couple of useful fields in sub_klass:
4582   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4583   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4584   Address secondary_supers_addr(sub_klass, ss_offset);
4585   Address super_cache_addr(     sub_klass, sc_offset);
4586 
4587   // Do a linear scan of the secondary super-klass chain.
4588   // This code is rarely used, so simplicity is a virtue here.
4589   // The repne_scan instruction uses fixed registers, which we must spill.
4590   // Don't worry too much about pre-existing connections with the input regs.
4591 
4592   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4593   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4594 
4595   // Get super_klass value into rax (even if it was in rdi or rcx).
4596   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4597   if (super_klass != rax) {
4598     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4599     mov(rax, super_klass);
4600   }
4601   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4602   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4603 
4604 #ifndef PRODUCT
4605   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4606   ExternalAddress pst_counter_addr((address) pst_counter);
4607   NOT_LP64(  incrementl(pst_counter_addr) );
4608   LP64_ONLY( lea(rcx, pst_counter_addr) );
4609   LP64_ONLY( incrementl(Address(rcx, 0)) );
4610 #endif //PRODUCT
4611 
4612   // We will consult the secondary-super array.
4613   movptr(rdi, secondary_supers_addr);
4614   // Load the array length.  (Positive movl does right thing on LP64.)
4615   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4616   // Skip to start of data.
4617   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4618 
4619   // Scan RCX words at [RDI] for an occurrence of RAX.
4620   // Set NZ/Z based on last compare.
4621   // The Z flag is not set by 'repne' itself when RCX == 0, since 'repne' does
4622   // not change flags; only the repeated scas instruction sets them.
4623   // Set Z = 0 (not equal) before 'repne' so a zero-length scan reports "not found".
4624 
4625   testptr(rax, rax); // Set Z = 0
4626   repne_scan();
4627 
4628   // Unspill the temp. registers:
4629   if (pushed_rdi)  pop(rdi);
4630   if (pushed_rcx)  pop(rcx);
4631   if (pushed_rax)  pop(rax);
4632 
4633   if (set_cond_codes) {
4634     // Special hack for the AD files:  rdi is guaranteed non-zero.
4635     assert(!pushed_rdi, "rdi must be left non-null");
4636     // Also, the condition codes are properly set Z/NZ on success/failure.
4637   }
4638 
4639   if (L_failure == &L_fallthrough)
4640         jccb(Assembler::notEqual, *L_failure);
4641   else  jcc(Assembler::notEqual, *L_failure);
4642 
4643   // Success.  Cache the super we found and proceed in triumph.
4644   movptr(super_cache_addr, super_klass);
4645 
4646   if (L_success != &L_fallthrough) {
4647     jmp(*L_success);
4648   }
4649 
4650 #undef IS_A_TEMP
4651 
4652   bind(L_fallthrough);
4653 }
4654 
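// Class-initialization barrier: take the fast path if the klass is already
// fully initialized, or if the current thread is the one running <clinit>
// (which may legally touch the class while it is still being initialized);
// otherwise branch to the slow path.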
4655 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4656   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
4657 
4658   Label L_fallthrough;
4659   if (L_fast_path == nullptr) {
4660     L_fast_path = &L_fallthrough;
4661   } else if (L_slow_path == nullptr) {
4662     L_slow_path = &L_fallthrough;
4663   }
4664 
4665   // Fast path check: class is fully initialized
4666   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4667   jcc(Assembler::equal, *L_fast_path);
4668 
4669   // Fast path check: current thread is initializer thread
4670   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4671   if (L_slow_path == &L_fallthrough) {
4672     jcc(Assembler::equal, *L_fast_path);
4673     bind(*L_slow_path);
4674   } else if (L_fast_path == &L_fallthrough) {
4675     jcc(Assembler::notEqual, *L_slow_path);
4676     bind(*L_fast_path);
4677   } else {
4678     Unimplemented();
4679   }
4680 }
4681 
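// Conditional 32-bit move that also works on pre-CMOV hardware: if the CPU
// lacks cmov, emit a short branch around a plain movl instead.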
4682 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4683   if (VM_Version::supports_cmov()) {
4684     cmovl(cc, dst, src);
4685   } else {
4686     Label L;
4687     jccb(negate_condition(cc), L);
4688     movl(dst, src);
4689     bind(L);
4690   }
4691 }
4692 
4693 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4694   if (VM_Version::supports_cmov()) {
4695     cmovl(cc, dst, src);
4696   } else {
4697     Label L;
4698     jccb(negate_condition(cc), L);
4699     movl(dst, src);
4700     bind(L);
4701   }
4702 }
4703 
4704 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4705   if (!VerifyOops) return;
4706 
4707   BLOCK_COMMENT("verify_oop {");
4708 #ifdef _LP64
4709   push(rscratch1);
4710 #endif
4711   push(rax);                          // save rax
4712   push(reg);                          // pass register argument
4713 
4714   // Build the diagnostic message and pass it to verify_oop_subroutine
4715   const char* b = nullptr;
4716   {
4717     ResourceMark rm;
4718     stringStream ss;
4719     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4720     b = code_string(ss.as_string());
4721   }
4722   ExternalAddress buffer((address) b);
4723   pushptr(buffer.addr(), rscratch1);
4724 
4725   // call indirectly to solve generation ordering problem
4726   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4727   call(rax);
4728   // Caller pops the arguments (oop, message) and restores rax, r10
4729   BLOCK_COMMENT("} verify_oop");
4730 }
4731 
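// Set every bit of the destination vector register to 1, picking the
// cheapest idiom the current CPU supports (vpternlogd, vpcmpeqd or pcmpeqd).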
4732 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
4733   if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4734     // Only pcmpeq has dependency-breaking treatment (i.e., execution can begin without
4735     // waiting for the previous result on dst), not vpcmpeqd, so just use vpternlog.
4736     vpternlogd(dst, 0xFF, dst, dst, vector_len);
4737   } else if (VM_Version::supports_avx()) {
4738     vpcmpeqd(dst, dst, dst, vector_len);
4739   } else {
4740     assert(VM_Version::supports_sse2(), "");
4741     pcmpeqd(dst, dst);
4742   }
4743 }
4744 
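// Compute the address of an interpreter argument slot relative to rsp,
// accounting for the return PC that is already on the stack.  arg_slot may
// be a constant or a register holding the slot index.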
4745 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4746                                          int extra_slot_offset) {
4747   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4748   int stackElementSize = Interpreter::stackElementSize;
4749   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4750 #ifdef ASSERT
4751   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4752   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4753 #endif
4754   Register             scale_reg    = noreg;
4755   Address::ScaleFactor scale_factor = Address::no_scale;
4756   if (arg_slot.is_constant()) {
4757     offset += arg_slot.as_constant() * stackElementSize;
4758   } else {
4759     scale_reg    = arg_slot.as_register();
4760     scale_factor = Address::times(stackElementSize);
4761   }
4762   offset += wordSize;           // return PC is on stack
4763   return Address(rsp, scale_reg, scale_factor, offset);
4764 }
4765 
4766 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
4767   if (!VerifyOops) return;
4768 
4769 #ifdef _LP64
4770   push(rscratch1);
4771 #endif
4772   push(rax); // save rax,
4773   // addr may contain rsp so we will have to adjust it based on the push
4774   // we just did (and on 64 bit we do two pushes)
4775   // NOTE: the 64-bit code used to have a bug here: it did movq(addr, rax), which
4776   // stores rax into addr, the reverse of what was intended.
4777   if (addr.uses(rsp)) {
4778     lea(rax, addr);
4779     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4780   } else {
4781     pushptr(addr);
4782   }
4783 
4784   // Build the diagnostic message and pass it to verify_oop_subroutine
4785   const char* b = nullptr;
4786   {
4787     ResourceMark rm;
4788     stringStream ss;
4789     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
4790     b = code_string(ss.as_string());
4791   }
4792   ExternalAddress buffer((address) b);
4793   pushptr(buffer.addr(), rscratch1);
4794 
4795   // call indirectly to solve generation ordering problem
4796   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4797   call(rax);
4798   // Caller pops the arguments (addr, message) and restores rax, r10.
4799 }
4800 
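// Debug-only check of the TLAB invariants start <= top <= end for the
// current thread; emits code only when both UseTLAB and VerifyOops are set.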
4801 void MacroAssembler::verify_tlab() {
4802 #ifdef ASSERT
4803   if (UseTLAB && VerifyOops) {
4804     Label next, ok;
4805     Register t1 = rsi;
4806     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4807 
4808     push(t1);
4809     NOT_LP64(push(thread_reg));
4810     NOT_LP64(get_thread(thread_reg));
4811 
4812     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4813     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4814     jcc(Assembler::aboveEqual, next);
4815     STOP("assert(top >= start)");
4816     should_not_reach_here();
4817 
4818     bind(next);
4819     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4820     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4821     jcc(Assembler::aboveEqual, ok);
4822     STOP("assert(top <= end)");
4823     should_not_reach_here();
4824 
4825     bind(ok);
4826     NOT_LP64(pop(thread_reg));
4827     pop(t1);
4828   }
4829 #endif
4830 }
4831 
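// The classes below mirror the x87/FPU and integer register state laid out
// on the stack by push_CPU_state(), and exist only to pretty-print that
// state from debugging helpers such as print_CPU_state() and _verify_FPU().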
4832 class ControlWord {
4833  public:
4834   int32_t _value;
4835 
4836   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4837   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4838   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4839   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4840   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4841   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4842   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4843   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4844 
4845   void print() const {
4846     // rounding control
4847     const char* rc;
4848     switch (rounding_control()) {
4849       case 0: rc = "round near"; break;
4850       case 1: rc = "round down"; break;
4851       case 2: rc = "round up  "; break;
4852       case 3: rc = "chop      "; break;
4853       default:
4854         rc = nullptr; // silence compiler warnings
4855         fatal("Unknown rounding control: %d", rounding_control());
4856     };
4857     // precision control
4858     const char* pc;
4859     switch (precision_control()) {
4860       case 0: pc = "24 bits "; break;
4861       case 1: pc = "reserved"; break;
4862       case 2: pc = "53 bits "; break;
4863       case 3: pc = "64 bits "; break;
4864       default:
4865         pc = nullptr; // silence compiler warnings
4866         fatal("Unknown precision control: %d", precision_control());
4867     };
4868     // flags
4869     char f[9];
4870     f[0] = ' ';
4871     f[1] = ' ';
4872     f[2] = (precision   ()) ? 'P' : 'p';
4873     f[3] = (underflow   ()) ? 'U' : 'u';
4874     f[4] = (overflow    ()) ? 'O' : 'o';
4875     f[5] = (zero_divide ()) ? 'Z' : 'z';
4876     f[6] = (denormalized()) ? 'D' : 'd';
4877     f[7] = (invalid     ()) ? 'I' : 'i';
4878     f[8] = '\x0';
4879     // output
4880     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4881   }
4882 
4883 };
4884 
4885 class StatusWord {
4886  public:
4887   int32_t _value;
4888 
4889   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4890   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4891   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4892   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4893   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4894   int  top() const                     { return  (_value >> 11) & 7      ; }
4895   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4896   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4897   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4898   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4899   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4900   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4901   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4902   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4903 
4904   void print() const {
4905     // condition codes
4906     char c[5];
4907     c[0] = (C3()) ? '3' : '-';
4908     c[1] = (C2()) ? '2' : '-';
4909     c[2] = (C1()) ? '1' : '-';
4910     c[3] = (C0()) ? '0' : '-';
4911     c[4] = '\x0';
4912     // flags
4913     char f[9];
4914     f[0] = (error_status()) ? 'E' : '-';
4915     f[1] = (stack_fault ()) ? 'S' : '-';
4916     f[2] = (precision   ()) ? 'P' : '-';
4917     f[3] = (underflow   ()) ? 'U' : '-';
4918     f[4] = (overflow    ()) ? 'O' : '-';
4919     f[5] = (zero_divide ()) ? 'Z' : '-';
4920     f[6] = (denormalized()) ? 'D' : '-';
4921     f[7] = (invalid     ()) ? 'I' : '-';
4922     f[8] = '\x0';
4923     // output
4924     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4925   }
4926 
4927 };
4928 
4929 class TagWord {
4930  public:
4931   int32_t _value;
4932 
4933   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4934 
4935   void print() const {
4936     printf("%04x", _value & 0xFFFF);
4937   }
4938 
4939 };
4940 
4941 class FPU_Register {
4942  public:
4943   int32_t _m0;
4944   int32_t _m1;
4945   int16_t _ex;
4946 
4947   bool is_indefinite() const           {
4948     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4949   }
4950 
4951   void print() const {
4952     char  sign = (_ex < 0) ? '-' : '+';
4953     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4954     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
4955   };
4956 
4957 };
4958 
4959 class FPU_State {
4960  public:
4961   enum {
4962     register_size       = 10,
4963     number_of_registers =  8,
4964     register_mask       =  7
4965   };
4966 
4967   ControlWord  _control_word;
4968   StatusWord   _status_word;
4969   TagWord      _tag_word;
4970   int32_t      _error_offset;
4971   int32_t      _error_selector;
4972   int32_t      _data_offset;
4973   int32_t      _data_selector;
4974   int8_t       _register[register_size * number_of_registers];
4975 
4976   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4977   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
4978 
4979   const char* tag_as_string(int tag) const {
4980     switch (tag) {
4981       case 0: return "valid";
4982       case 1: return "zero";
4983       case 2: return "special";
4984       case 3: return "empty";
4985     }
4986     ShouldNotReachHere();
4987     return nullptr;
4988   }
4989 
4990   void print() const {
4991     // print computation registers
4992     { int t = _status_word.top();
4993       for (int i = 0; i < number_of_registers; i++) {
4994         int j = (i - t) & register_mask;
4995         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4996         st(j)->print();
4997         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4998       }
4999     }
5000     printf("\n");
5001     // print control registers
5002     printf("ctrl = "); _control_word.print(); printf("\n");
5003     printf("stat = "); _status_word .print(); printf("\n");
5004     printf("tags = "); _tag_word    .print(); printf("\n");
5005   }
5006 
5007 };
5008 
5009 class Flag_Register {
5010  public:
5011   int32_t _value;
5012 
5013   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
5014   bool direction() const               { return ((_value >> 10) & 1) != 0; }
5015   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
5016   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
5017   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
5018   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
5019   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
5020 
5021   void print() const {
5022     // flags
5023     char f[8];
5024     f[0] = (overflow       ()) ? 'O' : '-';
5025     f[1] = (direction      ()) ? 'D' : '-';
5026     f[2] = (sign           ()) ? 'S' : '-';
5027     f[3] = (zero           ()) ? 'Z' : '-';
5028     f[4] = (auxiliary_carry()) ? 'A' : '-';
5029     f[5] = (parity         ()) ? 'P' : '-';
5030     f[6] = (carry          ()) ? 'C' : '-';
5031     f[7] = '\x0';
5032     // output
5033     printf("%08x  flags = %s", _value, f);
5034   }
5035 
5036 };
5037 
5038 class IU_Register {
5039  public:
5040   int32_t _value;
5041 
5042   void print() const {
5043     printf("%08x  %11d", _value, _value);
5044   }
5045 
5046 };
5047 
5048 class IU_State {
5049  public:
5050   Flag_Register _eflags;
5051   IU_Register   _rdi;
5052   IU_Register   _rsi;
5053   IU_Register   _rbp;
5054   IU_Register   _rsp;
5055   IU_Register   _rbx;
5056   IU_Register   _rdx;
5057   IU_Register   _rcx;
5058   IU_Register   _rax;
5059 
5060   void print() const {
5061     // computation registers
5062     printf("rax  = "); _rax.print(); printf("\n");
5063     printf("rbx  = "); _rbx.print(); printf("\n");
5064     printf("rcx  = "); _rcx.print(); printf("\n");
5065     printf("rdx  = "); _rdx.print(); printf("\n");
5066     printf("rdi  = "); _rdi.print(); printf("\n");
5067     printf("rsi  = "); _rsi.print(); printf("\n");
5068     printf("rbp  = "); _rbp.print(); printf("\n");
5069     printf("rsp  = "); _rsp.print(); printf("\n");
5070     printf("\n");
5071     // control registers
5072     printf("flgs = "); _eflags.print(); printf("\n");
5073   }
5074 };
5075 
5076 
5077 class CPU_State {
5078  public:
5079   FPU_State _fpu_state;
5080   IU_State  _iu_state;
5081 
5082   void print() const {
5083     printf("--------------------------------------------------\n");
5084     _iu_state .print();
5085     printf("\n");
5086     _fpu_state.print();
5087     printf("--------------------------------------------------\n");
5088   }
5089 
5090 };
5091 
5092 
5093 static void _print_CPU_state(CPU_State* state) {
5094   state->print();
5095 };
5096 
5097 
5098 void MacroAssembler::print_CPU_state() {
5099   push_CPU_state();
5100   push(rsp);                // pass CPU state
5101   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5102   addptr(rsp, wordSize);       // discard argument
5103   pop_CPU_state();
5104 }
5105 
5106 
5107 #ifndef _LP64
5108 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
5109   static int counter = 0;
5110   FPU_State* fs = &state->_fpu_state;
5111   counter++;
5112   // For leaf calls, only verify that the top few elements remain empty.
5113   // We only need 1 empty at the top for C2 code.
5114   if( stack_depth < 0 ) {
5115     if( fs->tag_for_st(7) != 3 ) {
5116       printf("FPR7 not empty\n");
5117       state->print();
5118       assert(false, "error");
5119       return false;
5120     }
5121     return true;                // All other stack states do not matter
5122   }
5123 
5124   assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(),
5125          "bad FPU control word");
5126 
5127   // compute stack depth
5128   int i = 0;
5129   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
5130   int d = i;
5131   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
5132   // verify findings
5133   if (i != FPU_State::number_of_registers) {
5134     // stack not contiguous
5135     printf("%s: stack not contiguous at ST%d\n", s, i);
5136     state->print();
5137     assert(false, "error");
5138     return false;
5139   }
5140   // check if computed stack depth corresponds to expected stack depth
5141   if (stack_depth < 0) {
5142     // expected stack depth is -stack_depth or less
5143     if (d > -stack_depth) {
5144       // too many elements on the stack
5145       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
5146       state->print();
5147       assert(false, "error");
5148       return false;
5149     }
5150   } else {
5151     // expected stack depth is stack_depth
5152     if (d != stack_depth) {
5153       // wrong stack depth
5154       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
5155       state->print();
5156       assert(false, "error");
5157       return false;
5158     }
5159   }
5160   // everything is cool
5161   return true;
5162 }
5163 
5164 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
5165   if (!VerifyFPU) return;
5166   push_CPU_state();
5167   push(rsp);                // pass CPU state
5168   ExternalAddress msg((address) s);
5169   // pass message string s
5170   pushptr(msg.addr(), noreg);
5171   push(stack_depth);        // pass stack depth
5172   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
5173   addptr(rsp, 3 * wordSize);   // discard arguments
5174   // check for error
5175   { Label L;
5176     testl(rax, rax);
5177     jcc(Assembler::notZero, L);
5178     int3();                  // break if error condition
5179     bind(L);
5180   }
5181   pop_CPU_state();
5182 }
5183 #endif // _LP64
5184 
5185 void MacroAssembler::restore_cpu_control_state_after_jni(Register rscratch) {
5186   // Either restore the MXCSR register after returning from the JNI Call
5187   // or verify that it wasn't changed (with -Xcheck:jni flag).
5188   if (VM_Version::supports_sse()) {
5189     if (RestoreMXCSROnJNICalls) {
5190       ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), rscratch);
5191     } else if (CheckJNICalls) {
5192       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5193     }
5194   }
5195   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5196   vzeroupper();
5197 
5198 #ifndef _LP64
5199   // Either restore the x87 floating point control word after returning
5200   // from the JNI call or verify that it wasn't changed.
5201   if (CheckJNICalls) {
5202     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
5203   }
5204 #endif // _LP64
5205 }
5206 
5207 // ((OopHandle)result).resolve();
5208 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5209   assert_different_registers(result, tmp);
5210 
5211   // Only 64 bit platforms support GCs that require a tmp register
5212   // Only IN_HEAP loads require a thread_tmp register
5213   // OopHandle::resolve is an indirection like jobject.
5214   access_load_at(T_OBJECT, IN_NATIVE,
5215                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
5216 }
5217 
5218 // ((WeakHandle)result).resolve();
5219 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
5220   assert_different_registers(rresult, rtmp);
5221   Label resolved;
5222 
5223   // A null weak handle resolves to null.
5224   cmpptr(rresult, 0);
5225   jcc(Assembler::equal, resolved);
5226 
5227   // Only 64 bit platforms support GCs that require a tmp register
5228   // Only IN_HEAP loads require a thread_tmp register
5229   // WeakHandle::resolve is an indirection like jweak.
5230   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
5231                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
5232   bind(resolved);
5233 }
5234 
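// Load the java.lang.Class mirror of the method's holder and resolve the
// OopHandle indirection; 'mirror' doubles as a temporary along the way.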
5235 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
5236   // get mirror
5237   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
5238   load_method_holder(mirror, method);
5239   movptr(mirror, Address(mirror, mirror_offset));
5240   resolve_oop_handle(mirror, tmp);
5241 }
5242 
5243 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
5244   load_method_holder(rresult, rmethod);
5245   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
5246 }
5247 
5248 void MacroAssembler::load_method_holder(Register holder, Register method) {
5249   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
5250   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
5251   movptr(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
5252 }
5253 
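// Load the klass pointer of 'src' into 'dst'.  With compressed class
// pointers (64-bit only) the narrow klass is loaded and decoded; 'tmp' may
// be needed by the decode step.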
5254 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
5255   assert_different_registers(src, tmp);
5256   assert_different_registers(dst, tmp);
5257 #ifdef _LP64
5258   if (UseCompressedClassPointers) {
5259     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5260     decode_klass_not_null(dst, tmp);
5261   } else
5262 #endif
5263     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5264 }
5265 
5266 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
5267   assert_different_registers(src, tmp);
5268   assert_different_registers(dst, tmp);
5269 #ifdef _LP64
5270   if (UseCompressedClassPointers) {
5271     encode_klass_not_null(src, tmp);
5272     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5273   } else
5274 #endif
5275     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5276 }
5277 
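// GC-aware load: dispatch through the active BarrierSetAssembler so the
// collector can insert load barriers; AS_RAW bypasses them and performs a
// plain load.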
5278 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5279                                     Register tmp1, Register thread_tmp) {
5280   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5281   decorators = AccessInternal::decorator_fixup(decorators, type);
5282   bool as_raw = (decorators & AS_RAW) != 0;
5283   if (as_raw) {
5284     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5285   } else {
5286     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5287   }
5288 }
5289 
5290 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
5291                                      Register tmp1, Register tmp2, Register tmp3) {
5292   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5293   decorators = AccessInternal::decorator_fixup(decorators, type);
5294   bool as_raw = (decorators & AS_RAW) != 0;
5295   if (as_raw) {
5296     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
5297   } else {
5298     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
5299   }
5300 }
5301 
5302 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
5303                                    Register thread_tmp, DecoratorSet decorators) {
5304   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
5305 }
5306 
5307 // Doesn't do verification, generates fixed size code
5308 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
5309                                             Register thread_tmp, DecoratorSet decorators) {
5310   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
5311 }
5312 
5313 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
5314                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
5315   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
5316 }
5317 
5318 // Used for storing nulls.
5319 void MacroAssembler::store_heap_oop_null(Address dst) {
5320   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
5321 }
5322 
5323 #ifdef _LP64
5324 void MacroAssembler::store_klass_gap(Register dst, Register src) {
5325   if (UseCompressedClassPointers) {
5326     // Store to klass gap in destination
5327     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5328   }
5329 }
5330 
5331 #ifdef ASSERT
5332 void MacroAssembler::verify_heapbase(const char* msg) {
5333   assert (UseCompressedOops, "should be compressed");
5334   assert (Universe::heap() != nullptr, "java heap should be initialized");
5335   if (CheckCompressedOops) {
5336     Label ok;
5337     ExternalAddress src2(CompressedOops::ptrs_base_addr());
5338     const bool is_src2_reachable = reachable(src2);
5339     if (!is_src2_reachable) {
5340       push(rscratch1);  // cmpptr trashes rscratch1
5341     }
5342     cmpptr(r12_heapbase, src2, rscratch1);
5343     jcc(Assembler::equal, ok);
5344     STOP(msg);
5345     bind(ok);
5346     if (!is_src2_reachable) {
5347       pop(rscratch1);
5348     }
5349   }
5350 }
5351 #endif
5352 
5353 // Algorithm must match oop.inline.hpp encode_heap_oop.
5354 void MacroAssembler::encode_heap_oop(Register r) {
5355 #ifdef ASSERT
5356   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5357 #endif
5358   verify_oop_msg(r, "broken oop in encode_heap_oop");
5359   if (CompressedOops::base() == nullptr) {
5360     if (CompressedOops::shift() != 0) {
5361       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5362       shrq(r, LogMinObjAlignmentInBytes);
5363     }
5364     return;
5365   }
5366   testq(r, r);
5367   cmovq(Assembler::equal, r, r12_heapbase);
5368   subq(r, r12_heapbase);
5369   shrq(r, LogMinObjAlignmentInBytes);
5370 }
5371 
5372 void MacroAssembler::encode_heap_oop_not_null(Register r) {
5373 #ifdef ASSERT
5374   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5375   if (CheckCompressedOops) {
5376     Label ok;
5377     testq(r, r);
5378     jcc(Assembler::notEqual, ok);
5379     STOP("null oop passed to encode_heap_oop_not_null");
5380     bind(ok);
5381   }
5382 #endif
5383   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
5384   if (CompressedOops::base() != nullptr) {
5385     subq(r, r12_heapbase);
5386   }
5387   if (CompressedOops::shift() != 0) {
5388     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5389     shrq(r, LogMinObjAlignmentInBytes);
5390   }
5391 }
5392 
5393 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5394 #ifdef ASSERT
5395   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5396   if (CheckCompressedOops) {
5397     Label ok;
5398     testq(src, src);
5399     jcc(Assembler::notEqual, ok);
5400     STOP("null oop passed to encode_heap_oop_not_null2");
5401     bind(ok);
5402   }
5403 #endif
5404   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
5405   if (dst != src) {
5406     movq(dst, src);
5407   }
5408   if (CompressedOops::base() != nullptr) {
5409     subq(dst, r12_heapbase);
5410   }
5411   if (CompressedOops::shift() != 0) {
5412     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5413     shrq(dst, LogMinObjAlignmentInBytes);
5414   }
5415 }
5416 
5417 void  MacroAssembler::decode_heap_oop(Register r) {
5418 #ifdef ASSERT
5419   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5420 #endif
5421   if (CompressedOops::base() == nullptr) {
5422     if (CompressedOops::shift() != 0) {
5423       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5424       shlq(r, LogMinObjAlignmentInBytes);
5425     }
5426   } else {
5427     Label done;
5428     shlq(r, LogMinObjAlignmentInBytes);
5429     jccb(Assembler::equal, done);
5430     addq(r, r12_heapbase);
5431     bind(done);
5432   }
5433   verify_oop_msg(r, "broken oop in decode_heap_oop");
5434 }
5435 
5436 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
5437   // Note: it will change flags
5438   assert (UseCompressedOops, "should only be used for compressed headers");
5439   assert (Universe::heap() != nullptr, "java heap should be initialized");
5440   // Cannot assert, unverified entry point counts instructions (see .ad file)
5441   // vtableStubs also counts instructions in pd_code_size_limit.
5442   // Also do not verify_oop as this is called by verify_oop.
5443   if (CompressedOops::shift() != 0) {
5444     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5445     shlq(r, LogMinObjAlignmentInBytes);
5446     if (CompressedOops::base() != nullptr) {
5447       addq(r, r12_heapbase);
5448     }
5449   } else {
5450     assert (CompressedOops::base() == nullptr, "sanity");
5451   }
5452 }
5453 
5454 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5455   // Note: it will change flags
5456   assert (UseCompressedOops, "should only be used for compressed headers");
5457   assert (Universe::heap() != nullptr, "java heap should be initialized");
5458   // Cannot assert, unverified entry point counts instructions (see .ad file)
5459   // vtableStubs also counts instructions in pd_code_size_limit.
5460   // Also do not verify_oop as this is called by verify_oop.
5461   if (CompressedOops::shift() != 0) {
5462     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5463     if (LogMinObjAlignmentInBytes == Address::times_8) {
5464       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5465     } else {
5466       if (dst != src) {
5467         movq(dst, src);
5468       }
5469       shlq(dst, LogMinObjAlignmentInBytes);
5470       if (CompressedOops::base() != nullptr) {
5471         addq(dst, r12_heapbase);
5472       }
5473     }
5474   } else {
5475     assert (CompressedOops::base() == nullptr, "sanity");
5476     if (dst != src) {
5477       movq(dst, src);
5478     }
5479   }
5480 }
5481 
5482 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
5483   assert_different_registers(r, tmp);
5484   if (CompressedKlassPointers::base() != nullptr) {
5485     mov64(tmp, (int64_t)CompressedKlassPointers::base());
5486     subq(r, tmp);
5487   }
5488   if (CompressedKlassPointers::shift() != 0) {
5489     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5490     shrq(r, LogKlassAlignmentInBytes);
5491   }
5492 }
5493 
5494 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
5495   assert_different_registers(src, dst);
5496   if (CompressedKlassPointers::base() != nullptr) {
5497     mov64(dst, -(int64_t)CompressedKlassPointers::base());
5498     addq(dst, src);
5499   } else {
5500     movptr(dst, src);
5501   }
5502   if (CompressedKlassPointers::shift() != 0) {
5503     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5504     shrq(dst, LogKlassAlignmentInBytes);
5505   }
5506 }
5507 
5508 void  MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
5509   assert_different_registers(r, tmp);
5510   // Note: it will change flags
5511   assert(UseCompressedClassPointers, "should only be used for compressed headers");
5512   // Cannot assert, unverified entry point counts instructions (see .ad file)
5513   // vtableStubs also counts instructions in pd_code_size_limit.
5514   // Also do not verify_oop as this is called by verify_oop.
5515   if (CompressedKlassPointers::shift() != 0) {
5516     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5517     shlq(r, LogKlassAlignmentInBytes);
5518   }
5519   if (CompressedKlassPointers::base() != nullptr) {
5520     mov64(tmp, (int64_t)CompressedKlassPointers::base());
5521     addq(r, tmp);
5522   }
5523 }
5524 
5525 void  MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
5526   assert_different_registers(src, dst);
5527   // Note: it will change flags
5528   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5529   // Cannot assert, unverified entry point counts instructions (see .ad file)
5530   // vtableStubs also counts instructions in pd_code_size_limit.
5531   // Also do not verify_oop as this is called by verify_oop.
5532 
5533   if (CompressedKlassPointers::base() == nullptr &&
5534       CompressedKlassPointers::shift() == 0) {
5535     // The best case scenario is that there is no base or shift. Then it is already
5536     // a pointer that needs nothing but a register rename.
5537     movl(dst, src);
5538   } else {
5539     if (CompressedKlassPointers::base() != nullptr) {
5540       mov64(dst, (int64_t)CompressedKlassPointers::base());
5541     } else {
5542       xorq(dst, dst);
5543     }
5544     if (CompressedKlassPointers::shift() != 0) {
5545       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5546       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5547       leaq(dst, Address(dst, src, Address::times_8, 0));
5548     } else {
5549       addq(dst, src);
5550     }
5551   }
5552 }
5553 
5554 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5555   assert (UseCompressedOops, "should only be used for compressed headers");
5556   assert (Universe::heap() != nullptr, "java heap should be initialized");
5557   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5558   int oop_index = oop_recorder()->find_index(obj);
5559   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5560   mov_narrow_oop(dst, oop_index, rspec);
5561 }
5562 
5563 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5564   assert (UseCompressedOops, "should only be used for compressed headers");
5565   assert (Universe::heap() != nullptr, "java heap should be initialized");
5566   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5567   int oop_index = oop_recorder()->find_index(obj);
5568   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5569   mov_narrow_oop(dst, oop_index, rspec);
5570 }
5571 
5572 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5573   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5574   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5575   int klass_index = oop_recorder()->find_index(k);
5576   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5577   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5578 }
5579 
5580 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5581   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5582   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5583   int klass_index = oop_recorder()->find_index(k);
5584   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5585   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5586 }
5587 
5588 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5589   assert (UseCompressedOops, "should only be used for compressed headers");
5590   assert (Universe::heap() != nullptr, "java heap should be initialized");
5591   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5592   int oop_index = oop_recorder()->find_index(obj);
5593   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5594   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5595 }
5596 
5597 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5598   assert (UseCompressedOops, "should only be used for compressed headers");
5599   assert (Universe::heap() != nullptr, "java heap should be initialized");
5600   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5601   int oop_index = oop_recorder()->find_index(obj);
5602   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5603   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5604 }
5605 
5606 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5607   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5608   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5609   int klass_index = oop_recorder()->find_index(k);
5610   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5611   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5612 }
5613 
5614 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5615   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5616   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5617   int klass_index = oop_recorder()->find_index(k);
5618   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5619   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5620 }
5621 
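// Reload r12_heapbase with the compressed-oops base.  Before the heap is
// initialized the value is fetched from the runtime; afterwards it can be
// materialized as a constant (or zeroed for zero-based compressed oops).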
5622 void MacroAssembler::reinit_heapbase() {
5623   if (UseCompressedOops) {
5624     if (Universe::heap() != nullptr) {
5625       if (CompressedOops::base() == nullptr) {
5626         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5627       } else {
5628         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5629       }
5630     } else {
5631       movptr(r12_heapbase, ExternalAddress(CompressedOops::ptrs_base_addr()));
5632     }
5633   }
5634 }
5635 
5636 #endif // _LP64
5637 
5638 #if COMPILER2_OR_JVMCI
5639 
5640 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
5641 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5642   // cnt - number of qwords (8-byte words).
5643   // base - start address, qword aligned.
5644   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5645   bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0);
5646   if (use64byteVector) {
5647     vpxor(xtmp, xtmp, xtmp, AVX_512bit);
5648   } else if (MaxVectorSize >= 32) {
5649     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5650   } else {
5651     pxor(xtmp, xtmp);
5652   }
5653   jmp(L_zero_64_bytes);
5654 
5655   BIND(L_loop);
5656   if (MaxVectorSize >= 32) {
5657     fill64(base, 0, xtmp, use64byteVector);
5658   } else {
5659     movdqu(Address(base,  0), xtmp);
5660     movdqu(Address(base, 16), xtmp);
5661     movdqu(Address(base, 32), xtmp);
5662     movdqu(Address(base, 48), xtmp);
5663   }
5664   addptr(base, 64);
5665 
5666   BIND(L_zero_64_bytes);
5667   subptr(cnt, 8);
5668   jccb(Assembler::greaterEqual, L_loop);
5669 
5670   // Copy trailing 64 bytes
5671   if (use64byteVector) {
5672     addptr(cnt, 8);
5673     jccb(Assembler::equal, L_end);
5674     fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
5675     jmp(L_end);
5676   } else {
5677     addptr(cnt, 4);
5678     jccb(Assembler::less, L_tail);
5679     if (MaxVectorSize >= 32) {
5680       vmovdqu(Address(base, 0), xtmp);
5681     } else {
5682       movdqu(Address(base,  0), xtmp);
5683       movdqu(Address(base, 16), xtmp);
5684     }
5685   }
5686   addptr(base, 32);
5687   subptr(cnt, 4);
5688 
5689   BIND(L_tail);
5690   addptr(cnt, 4);
5691   jccb(Assembler::lessEqual, L_end);
5692   if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
5693     fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
5694   } else {
5695     decrement(cnt);
5696 
5697     BIND(L_sloop);
5698     movq(Address(base, 0), xtmp);
5699     addptr(base, 8);
5700     decrement(cnt);
5701     jccb(Assembler::greaterEqual, L_sloop);
5702   }
5703   BIND(L_end);
5704 }
5705 
5706 // Clearing constant sized memory using YMM/ZMM registers.
5707 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5708   assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), "");
5709   bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
5710 
5711   int vector64_count = (cnt & (~0x7)) >> 3;
5712   cnt = cnt & 0x7;
5713   const int fill64_per_loop = 4;
5714   const int max_unrolled_fill64 = 8;
5715 
5716   // 64 byte initialization loop.
5717   vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
5718   int start64 = 0;
5719   if (vector64_count > max_unrolled_fill64) {
5720     Label LOOP;
5721     Register index = rtmp;
5722 
5723     start64 = vector64_count - (vector64_count % fill64_per_loop);
5724 
5725     movl(index, 0);
5726     BIND(LOOP);
5727     for (int i = 0; i < fill64_per_loop; i++) {
5728       fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
5729     }
5730     addl(index, fill64_per_loop * 64);
5731     cmpl(index, start64 * 64);
5732     jccb(Assembler::less, LOOP);
5733   }
5734   for (int i = start64; i < vector64_count; i++) {
5735     fill64(base, i * 64, xtmp, use64byteVector);
5736   }
5737 
5738   // Clear remaining 64 byte tail.
5739   int disp = vector64_count * 64;
5740   if (cnt) {
5741     switch (cnt) {
5742       case 1:
5743         movq(Address(base, disp), xtmp);
5744         break;
5745       case 2:
5746         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit);
5747         break;
5748       case 3:
5749         movl(rtmp, 0x7);
5750         kmovwl(mask, rtmp);
5751         evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit);
5752         break;
5753       case 4:
5754         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5755         break;
5756       case 5:
5757         if (use64byteVector) {
5758           movl(rtmp, 0x1F);
5759           kmovwl(mask, rtmp);
5760           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5761         } else {
5762           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5763           movq(Address(base, disp + 32), xtmp);
5764         }
5765         break;
5766       case 6:
5767         if (use64byteVector) {
5768           movl(rtmp, 0x3F);
5769           kmovwl(mask, rtmp);
5770           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5771         } else {
5772           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5773           evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit);
5774         }
5775         break;
5776       case 7:
5777         if (use64byteVector) {
5778           movl(rtmp, 0x7F);
5779           kmovwl(mask, rtmp);
5780           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5781         } else {
5782           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5783           movl(rtmp, 0x7);
5784           kmovwl(mask, rtmp);
5785           evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit);
5786         }
5787         break;
5788       default:
5789         fatal("Unexpected length: %d\n", cnt);
5790         break;
5791     }
5792   }
5793 }
5794 
5795 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
5796                                bool is_large, KRegister mask) {
5797   // cnt      - number of qwords (8-byte words).
5798   // base     - start address, qword aligned.
5799   // is_large - if optimizers know cnt is larger than InitArrayShortSize
5800   assert(base==rdi, "base register must be edi for rep stos");
5801   assert(tmp==rax,   "tmp register must be eax for rep stos");
5802   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
5803   assert(InitArrayShortSize % BytesPerLong == 0,
5804     "InitArrayShortSize should be the multiple of BytesPerLong");
5805 
5806   Label DONE;
5807   if (!is_large || !UseXMMForObjInit) {
5808     xorptr(tmp, tmp);
5809   }
5810 
5811   if (!is_large) {
5812     Label LOOP, LONG;
5813     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5814     jccb(Assembler::greater, LONG);
5815 
5816     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5817 
5818     decrement(cnt);
5819     jccb(Assembler::negative, DONE); // Zero length
5820 
5821     // Use individual pointer-sized stores for small counts:
5822     BIND(LOOP);
5823     movptr(Address(base, cnt, Address::times_ptr), tmp);
5824     decrement(cnt);
5825     jccb(Assembler::greaterEqual, LOOP);
5826     jmpb(DONE);
5827 
5828     BIND(LONG);
5829   }
5830 
5831   // Use longer rep-prefixed ops for non-small counts:
5832   if (UseFastStosb) {
5833     shlptr(cnt, 3); // convert to number of bytes
5834     rep_stosb();
5835   } else if (UseXMMForObjInit) {
5836     xmm_clear_mem(base, cnt, tmp, xtmp, mask);
5837   } else {
5838     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5839     rep_stos();
5840   }
5841 
5842   BIND(DONE);
5843 }
5844 
5845 #endif //COMPILER2_OR_JVMCI
5846 
5847 
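// Fill 'count' elements of type 't' (byte, short or int) starting at 'to'
// with 'value'.  The value is first replicated into a full 32-bit pattern,
// then the fill proceeds with the widest stores the CPU supports, with
// scalar pre/post loops handling alignment and the tail.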
5848 void MacroAssembler::generate_fill(BasicType t, bool aligned,
5849                                    Register to, Register value, Register count,
5850                                    Register rtmp, XMMRegister xtmp) {
5851   ShortBranchVerifier sbv(this);
5852   assert_different_registers(to, value, count, rtmp);
5853   Label L_exit;
5854   Label L_fill_2_bytes, L_fill_4_bytes;
5855 
5856 #if defined(COMPILER2) && defined(_LP64)
5857   if (MaxVectorSize >= 32 &&
5858      VM_Version::supports_avx512vlbw() &&
5859      VM_Version::supports_bmi2()) {
5860     generate_fill_avx3(t, to, value, count, rtmp, xtmp);
5861     return;
5862   }
5863 #endif
5864 
5865   int shift = -1;
5866   switch (t) {
5867     case T_BYTE:
5868       shift = 2;
5869       break;
5870     case T_SHORT:
5871       shift = 1;
5872       break;
5873     case T_INT:
5874       shift = 0;
5875       break;
5876     default: ShouldNotReachHere();
5877   }
5878 
5879   if (t == T_BYTE) {
5880     andl(value, 0xff);
5881     movl(rtmp, value);
5882     shll(rtmp, 8);
5883     orl(value, rtmp);
5884   }
5885   if (t == T_SHORT) {
5886     andl(value, 0xffff);
5887   }
5888   if (t == T_BYTE || t == T_SHORT) {
5889     movl(rtmp, value);
5890     shll(rtmp, 16);
5891     orl(value, rtmp);
5892   }
5893 
5894   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
5895   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
5896   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
5897     Label L_skip_align2;
5898     // align source address at 4 bytes address boundary
5899     if (t == T_BYTE) {
5900       Label L_skip_align1;
5901       // One byte misalignment happens only for byte arrays
5902       testptr(to, 1);
5903       jccb(Assembler::zero, L_skip_align1);
5904       movb(Address(to, 0), value);
5905       increment(to);
5906       decrement(count);
5907       BIND(L_skip_align1);
5908     }
5909     // Two bytes misalignment happens only for byte and short (char) arrays
5910     testptr(to, 2);
5911     jccb(Assembler::zero, L_skip_align2);
5912     movw(Address(to, 0), value);
5913     addptr(to, 2);
5914     subl(count, 1<<(shift-1));
5915     BIND(L_skip_align2);
5916   }
5917   if (UseSSE < 2) {
5918     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5919     // Fill 32-byte chunks
5920     subl(count, 8 << shift);
5921     jcc(Assembler::less, L_check_fill_8_bytes);
5922     align(16);
5923 
5924     BIND(L_fill_32_bytes_loop);
5925 
5926     for (int i = 0; i < 32; i += 4) {
5927       movl(Address(to, i), value);
5928     }
5929 
5930     addptr(to, 32);
5931     subl(count, 8 << shift);
5932     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5933     BIND(L_check_fill_8_bytes);
5934     addl(count, 8 << shift);
5935     jccb(Assembler::zero, L_exit);
5936     jmpb(L_fill_8_bytes);
5937 
5938     //
5939     // length is too short, just fill qwords
5940     //
5941     BIND(L_fill_8_bytes_loop);
5942     movl(Address(to, 0), value);
5943     movl(Address(to, 4), value);
5944     addptr(to, 8);
5945     BIND(L_fill_8_bytes);
5946     subl(count, 1 << (shift + 1));
5947     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5948     // fall through to fill 4 bytes
5949   } else {
5950     Label L_fill_32_bytes;
5951     if (!UseUnalignedLoadStores) {
5952       // align to 8 bytes, we know we are 4 byte aligned to start
5953       testptr(to, 4);
5954       jccb(Assembler::zero, L_fill_32_bytes);
5955       movl(Address(to, 0), value);
5956       addptr(to, 4);
5957       subl(count, 1<<shift);
5958     }
5959     BIND(L_fill_32_bytes);
5960     {
5961       assert( UseSSE >= 2, "supported cpu only" );
5962       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5963       movdl(xtmp, value);
5964       if (UseAVX >= 2 && UseUnalignedLoadStores) {
5965         Label L_check_fill_32_bytes;
5966         if (UseAVX > 2) {
5967           // Fill 64-byte chunks
5968           Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
5969 
5970           // If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2
5971           cmpl(count, VM_Version::avx3_threshold());
5972           jccb(Assembler::below, L_check_fill_64_bytes_avx2);
5973 
5974           vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
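          // xtmp now has the 32-bit fill pattern broadcast to all 16 dword lanes of the ZMM register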
5975 
5976           subl(count, 16 << shift);
5977           jccb(Assembler::less, L_check_fill_32_bytes);
5978           align(16);
5979 
5980           BIND(L_fill_64_bytes_loop_avx3);
5981           evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
5982           addptr(to, 64);
5983           subl(count, 16 << shift);
5984           jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
5985           jmpb(L_check_fill_32_bytes);
5986 
5987           BIND(L_check_fill_64_bytes_avx2);
5988         }
5989         // Fill 64-byte chunks
5990         Label L_fill_64_bytes_loop;
5991         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
5992 
5993         subl(count, 16 << shift);
5994         jcc(Assembler::less, L_check_fill_32_bytes);
5995         align(16);
5996 
5997         BIND(L_fill_64_bytes_loop);
5998         vmovdqu(Address(to, 0), xtmp);
5999         vmovdqu(Address(to, 32), xtmp);
6000         addptr(to, 64);
6001         subl(count, 16 << shift);
6002         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
6003 
6004         BIND(L_check_fill_32_bytes);
6005         addl(count, 8 << shift);
6006         jccb(Assembler::less, L_check_fill_8_bytes);
6007         vmovdqu(Address(to, 0), xtmp);
6008         addptr(to, 32);
6009         subl(count, 8 << shift);
6010 
6011         BIND(L_check_fill_8_bytes);
6012         // clean upper bits of YMM registers
6013         movdl(xtmp, value);
6014         pshufd(xtmp, xtmp, 0);
6015       } else {
6016         // Fill 32-byte chunks
6017         pshufd(xtmp, xtmp, 0);
6018 
6019         subl(count, 8 << shift);
6020         jcc(Assembler::less, L_check_fill_8_bytes);
6021         align(16);
6022 
6023         BIND(L_fill_32_bytes_loop);
6024 
6025         if (UseUnalignedLoadStores) {
6026           movdqu(Address(to, 0), xtmp);
6027           movdqu(Address(to, 16), xtmp);
6028         } else {
6029           movq(Address(to, 0), xtmp);
6030           movq(Address(to, 8), xtmp);
6031           movq(Address(to, 16), xtmp);
6032           movq(Address(to, 24), xtmp);
6033         }
6034 
6035         addptr(to, 32);
6036         subl(count, 8 << shift);
6037         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
6038 
6039         BIND(L_check_fill_8_bytes);
6040       }
6041       addl(count, 8 << shift);
6042       jccb(Assembler::zero, L_exit);
6043       jmpb(L_fill_8_bytes);
6044 
6045       //
6046       // length is too short, just fill qwords
6047       //
6048       BIND(L_fill_8_bytes_loop);
6049       movq(Address(to, 0), xtmp);
6050       addptr(to, 8);
6051       BIND(L_fill_8_bytes);
6052       subl(count, 1 << (shift + 1));
6053       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
6054     }
6055   }
6056   // fill trailing 4 bytes
6057   BIND(L_fill_4_bytes);
6058   testl(count, 1<<shift);
6059   jccb(Assembler::zero, L_fill_2_bytes);
6060   movl(Address(to, 0), value);
6061   if (t == T_BYTE || t == T_SHORT) {
6062     Label L_fill_byte;
6063     addptr(to, 4);
6064     BIND(L_fill_2_bytes);
6065     // fill trailing 2 bytes
6066     testl(count, 1<<(shift-1));
6067     jccb(Assembler::zero, L_fill_byte);
6068     movw(Address(to, 0), value);
6069     if (t == T_BYTE) {
6070       addptr(to, 2);
6071       BIND(L_fill_byte);
6072       // fill trailing byte
6073       testl(count, 1);
6074       jccb(Assembler::zero, L_exit);
6075       movb(Address(to, 0), value);
6076     } else {
6077       BIND(L_fill_byte);
6078     }
6079   } else {
6080     BIND(L_fill_2_bytes);
6081   }
6082   BIND(L_exit);
6083 }
6084 
6085 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
6086   switch(type) {
6087     case T_BYTE:
6088     case T_BOOLEAN:
6089       evpbroadcastb(dst, src, vector_len);
6090       break;
6091     case T_SHORT:
6092     case T_CHAR:
6093       evpbroadcastw(dst, src, vector_len);
6094       break;
6095     case T_INT:
6096     case T_FLOAT:
6097       evpbroadcastd(dst, src, vector_len);
6098       break;
6099     case T_LONG:
6100     case T_DOUBLE:
6101       evpbroadcastq(dst, src, vector_len);
6102       break;
6103     default:
6104       fatal("Unhandled type : %s", type2name(type));
6105       break;
6106   }
6107 }
6108 
6109 // encode char[] to byte[] in ISO_8859_1 or ASCII
// @IntrinsicCandidate
// private static int implEncodeISOArray(byte[] sa, int sp,
//                                       byte[] da, int dp, int len) {
//   int i = 0;
//   for (; i < len; i++) {
//     char c = StringUTF16.getChar(sa, sp++);
//     if (c > '\u00FF')
//       break;
//     da[dp++] = (byte)c;
//   }
//   return i;
// }
//
// @IntrinsicCandidate
// private static int implEncodeAsciiArray(char[] sa, int sp,
//                                         byte[] da, int dp, int len) {
//   int i = 0;
//   for (; i < len; i++) {
//     char c = sa[sp++];
//     if (c >= '\u0080')
//       break;
//     da[dp++] = (byte)c;
//   }
//   return i;
// }
6135 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
6136   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
6137   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
6138   Register tmp5, Register result, bool ascii) {
6139 
6140   // rsi: src
6141   // rdi: dst
6142   // rdx: len
6143   // rcx: tmp5
6144   // rax: result
6145   ShortBranchVerifier sbv(this);
6146   assert_different_registers(src, dst, len, tmp5, result);
6147   Label L_done, L_copy_1_char, L_copy_1_char_exit;
6148 
6149   int mask = ascii ? 0xff80ff80 : 0xff00ff00;
6150   int short_mask = ascii ? 0xff80 : 0xff00;
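  // A char is encodable iff (c & short_mask) == 0, i.e. c < 0x80 for ASCII or
  // c < 0x100 for ISO-8859-1; 'mask' applies the same test to two packed chars per 32 bits.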
6151 
6152   // set result
6153   xorl(result, result);
6154   // check for zero length
6155   testl(len, len);
6156   jcc(Assembler::zero, L_done);
6157 
6158   movl(result, len);
6159 
6160   // Setup pointers
6161   lea(src, Address(src, len, Address::times_2)); // char[]
6162   lea(dst, Address(dst, len, Address::times_1)); // byte[]
6163   negptr(len);
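  // src/dst now point just past their last element; with len negated, the indexed
  // addresses below start at the original positions and len counts up toward zero.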
6164 
6165   if (UseSSE42Intrinsics || UseAVX >= 2) {
6166     Label L_copy_8_chars, L_copy_8_chars_exit;
6167     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
6168 
6169     if (UseAVX >= 2) {
6170       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
6171       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
6172       movdl(tmp1Reg, tmp5);
6173       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
6174       jmp(L_chars_32_check);
6175 
6176       bind(L_copy_32_chars);
6177       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
6178       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
6179       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
6180       vptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
6181       jccb(Assembler::notZero, L_copy_32_chars_exit);
6182       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
6183       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
6184       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
6185 
6186       bind(L_chars_32_check);
6187       addptr(len, 32);
6188       jcc(Assembler::lessEqual, L_copy_32_chars);
6189 
6190       bind(L_copy_32_chars_exit);
6191       subptr(len, 16);
6192       jccb(Assembler::greater, L_copy_16_chars_exit);
6193 
6194     } else if (UseSSE42Intrinsics) {
6195       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
6196       movdl(tmp1Reg, tmp5);
6197       pshufd(tmp1Reg, tmp1Reg, 0);
6198       jmpb(L_chars_16_check);
6199     }
6200 
6201     bind(L_copy_16_chars);
6202     if (UseAVX >= 2) {
6203       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
6204       vptest(tmp2Reg, tmp1Reg);
6205       jcc(Assembler::notZero, L_copy_16_chars_exit);
6206       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
6207       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
6208     } else {
6209       if (UseAVX > 0) {
6210         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
6211         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
6212         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
6213       } else {
6214         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
6215         por(tmp2Reg, tmp3Reg);
6216         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
6217         por(tmp2Reg, tmp4Reg);
6218       }
6219       ptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
6220       jccb(Assembler::notZero, L_copy_16_chars_exit);
6221       packuswb(tmp3Reg, tmp4Reg);
6222     }
6223     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
6224 
6225     bind(L_chars_16_check);
6226     addptr(len, 16);
6227     jcc(Assembler::lessEqual, L_copy_16_chars);
6228 
6229     bind(L_copy_16_chars_exit);
6230     if (UseAVX >= 2) {
6231       // clean upper bits of YMM registers
6232       vpxor(tmp2Reg, tmp2Reg);
6233       vpxor(tmp3Reg, tmp3Reg);
6234       vpxor(tmp4Reg, tmp4Reg);
6235       movdl(tmp1Reg, tmp5);
6236       pshufd(tmp1Reg, tmp1Reg, 0);
6237     }
6238     subptr(len, 8);
6239     jccb(Assembler::greater, L_copy_8_chars_exit);
6240 
6241     bind(L_copy_8_chars);
6242     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
6243     ptest(tmp3Reg, tmp1Reg);
6244     jccb(Assembler::notZero, L_copy_8_chars_exit);
6245     packuswb(tmp3Reg, tmp1Reg);
6246     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
6247     addptr(len, 8);
6248     jccb(Assembler::lessEqual, L_copy_8_chars);
6249 
6250     bind(L_copy_8_chars_exit);
6251     subptr(len, 8);
6252     jccb(Assembler::zero, L_done);
6253   }
6254 
6255   bind(L_copy_1_char);
6256   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
6257   testl(tmp5, short_mask);      // check if Unicode or non-ASCII char
6258   jccb(Assembler::notZero, L_copy_1_char_exit);
6259   movb(Address(dst, len, Address::times_1, 0), tmp5);
6260   addptr(len, 1);
6261   jccb(Assembler::less, L_copy_1_char);
6262 
6263   bind(L_copy_1_char_exit);
6264   addptr(result, len); // len is negative count of not processed elements
6265 
6266   bind(L_done);
6267 }
6268 
6269 #ifdef _LP64
6270 /**
6271  * Helper for multiply_to_len().
6272  */
6273 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
6274   addq(dest_lo, src1);
6275   adcq(dest_hi, 0);
6276   addq(dest_lo, src2);
6277   adcq(dest_hi, 0);
6278 }
6279 
6280 /**
6281  * Multiply 64 bit by 64 bit first loop.
6282  */
6283 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
6284                                            Register y, Register y_idx, Register z,
6285                                            Register carry, Register product,
6286                                            Register idx, Register kdx) {
6287   //
6288   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
6290   //    huge_128 product = y[idx] * x[xstart] + carry;
6291   //    z[kdx] = (jlong)product;
6292   //    carry  = (jlong)(product >>> 64);
6293   //  }
6294   //  z[xstart] = carry;
6295   //
6296 
6297   Label L_first_loop, L_first_loop_exit;
6298   Label L_one_x, L_one_y, L_multiply;
6299 
6300   decrementl(xstart);
6301   jcc(Assembler::negative, L_one_x);
6302 
6303   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
6304   rorq(x_xstart, 32); // convert big-endian to little-endian
6305 
6306   bind(L_first_loop);
6307   decrementl(idx);
6308   jcc(Assembler::negative, L_first_loop_exit);
6309   decrementl(idx);
6310   jcc(Assembler::negative, L_one_y);
6311   movq(y_idx, Address(y, idx, Address::times_4,  0));
6312   rorq(y_idx, 32); // convert big-endian to little-endian
6313   bind(L_multiply);
6314   movq(product, x_xstart);
6315   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
6316   addq(product, carry);
6317   adcq(rdx, 0);
6318   subl(kdx, 2);
6319   movl(Address(z, kdx, Address::times_4,  4), product);
6320   shrq(product, 32);
6321   movl(Address(z, kdx, Address::times_4,  0), product);
6322   movq(carry, rdx);
6323   jmp(L_first_loop);
6324 
6325   bind(L_one_y);
6326   movl(y_idx, Address(y,  0));
6327   jmp(L_multiply);
6328 
6329   bind(L_one_x);
6330   movl(x_xstart, Address(x,  0));
6331   jmp(L_first_loop);
6332 
6333   bind(L_first_loop_exit);
6334 }
6335 
6336 /**
6337  * Multiply 64 bit by 64 bit and add 128 bit.
6338  */
6339 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
6340                                             Register yz_idx, Register idx,
6341                                             Register carry, Register product, int offset) {
6342   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
6343   //     z[kdx] = (jlong)product;
6344 
6345   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
6346   rorq(yz_idx, 32); // convert big-endian to little-endian
6347   movq(product, x_xstart);
6348   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
6349   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
6350   rorq(yz_idx, 32); // convert big-endian to little-endian
6351 
6352   add2_with_carry(rdx, product, carry, yz_idx);
6353 
6354   movl(Address(z, idx, Address::times_4,  offset+4), product);
6355   shrq(product, 32);
6356   movl(Address(z, idx, Address::times_4,  offset), product);
6357 
6358 }
6359 
6360 /**
6361  * Multiply 128 bit by 128 bit. Unrolled inner loop.
6362  */
6363 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
6364                                              Register yz_idx, Register idx, Register jdx,
6365                                              Register carry, Register product,
6366                                              Register carry2) {
6367   //   jlong carry, x[], y[], z[];
6368   //   int kdx = ystart+1;
6369   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6370   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
6371   //     z[kdx+idx+1] = (jlong)product;
6372   //     jlong carry2  = (jlong)(product >>> 64);
6373   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
6374   //     z[kdx+idx] = (jlong)product;
6375   //     carry  = (jlong)(product >>> 64);
6376   //   }
6377   //   idx += 2;
6378   //   if (idx > 0) {
6379   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
6380   //     z[kdx+idx] = (jlong)product;
6381   //     carry  = (jlong)(product >>> 64);
6382   //   }
6383   //
6384 
6385   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6386 
6387   movl(jdx, idx);
6388   andl(jdx, 0xFFFFFFFC);
6389   shrl(jdx, 2);
6390 
6391   bind(L_third_loop);
6392   subl(jdx, 1);
6393   jcc(Assembler::negative, L_third_loop_exit);
6394   subl(idx, 4);
6395 
6396   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
6397   movq(carry2, rdx);
6398 
6399   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
6400   movq(carry, rdx);
6401   jmp(L_third_loop);
6402 
6403   bind (L_third_loop_exit);
6404 
6405   andl (idx, 0x3);
6406   jcc(Assembler::zero, L_post_third_loop_done);
6407 
6408   Label L_check_1;
6409   subl(idx, 2);
6410   jcc(Assembler::negative, L_check_1);
6411 
6412   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
6413   movq(carry, rdx);
6414 
6415   bind (L_check_1);
6416   addl (idx, 0x2);
6417   andl (idx, 0x1);
6418   subl(idx, 1);
6419   jcc(Assembler::negative, L_post_third_loop_done);
6420 
6421   movl(yz_idx, Address(y, idx, Address::times_4,  0));
6422   movq(product, x_xstart);
6423   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
6424   movl(yz_idx, Address(z, idx, Address::times_4,  0));
6425 
6426   add2_with_carry(rdx, product, yz_idx, carry);
6427 
6428   movl(Address(z, idx, Address::times_4,  0), product);
6429   shrq(product, 32);
6430 
6431   shlq(rdx, 32);
6432   orq(product, rdx);
6433   movq(carry, product);
6434 
6435   bind(L_post_third_loop_done);
6436 }
6437 
6438 /**
6439  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
6440  *
6441  */
6442 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
6443                                                   Register carry, Register carry2,
6444                                                   Register idx, Register jdx,
6445                                                   Register yz_idx1, Register yz_idx2,
6446                                                   Register tmp, Register tmp3, Register tmp4) {
6447   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
6448 
6449   //   jlong carry, x[], y[], z[];
6450   //   int kdx = ystart+1;
6451   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6452   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
6453   //     jlong carry2  = (jlong)(tmp3 >>> 64);
6454   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
6455   //     carry  = (jlong)(tmp4 >>> 64);
6456   //     z[kdx+idx+1] = (jlong)tmp3;
6457   //     z[kdx+idx] = (jlong)tmp4;
6458   //   }
6459   //   idx += 2;
6460   //   if (idx > 0) {
6461   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
6462   //     z[kdx+idx] = (jlong)yz_idx1;
6463   //     carry  = (jlong)(yz_idx1 >>> 64);
6464   //   }
6465   //
6466 
6467   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6468 
6469   movl(jdx, idx);
6470   andl(jdx, 0xFFFFFFFC);
6471   shrl(jdx, 2);
6472 
6473   bind(L_third_loop);
6474   subl(jdx, 1);
6475   jcc(Assembler::negative, L_third_loop_exit);
6476   subl(idx, 4);
6477 
6478   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
6479   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
6480   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
6481   rorxq(yz_idx2, yz_idx2, 32);
6482 
6483   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
6484   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
6485 
6486   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
6487   rorxq(yz_idx1, yz_idx1, 32);
6488   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6489   rorxq(yz_idx2, yz_idx2, 32);
6490 
6491   if (VM_Version::supports_adx()) {
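    // ADX provides two independent carry chains: adcx consumes/produces only CF
    // while adox consumes/produces only OF, so the two 64-bit accumulations below
    // do not serialize on a single carry flag.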
6492     adcxq(tmp3, carry);
6493     adoxq(tmp3, yz_idx1);
6494 
6495     adcxq(tmp4, tmp);
6496     adoxq(tmp4, yz_idx2);
6497 
6498     movl(carry, 0); // does not affect flags
6499     adcxq(carry2, carry);
6500     adoxq(carry2, carry);
6501   } else {
6502     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
6503     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
6504   }
6505   movq(carry, carry2);
6506 
6507   movl(Address(z, idx, Address::times_4, 12), tmp3);
6508   shrq(tmp3, 32);
6509   movl(Address(z, idx, Address::times_4,  8), tmp3);
6510 
6511   movl(Address(z, idx, Address::times_4,  4), tmp4);
6512   shrq(tmp4, 32);
6513   movl(Address(z, idx, Address::times_4,  0), tmp4);
6514 
6515   jmp(L_third_loop);
6516 
6517   bind (L_third_loop_exit);
6518 
6519   andl (idx, 0x3);
6520   jcc(Assembler::zero, L_post_third_loop_done);
6521 
6522   Label L_check_1;
6523   subl(idx, 2);
6524   jcc(Assembler::negative, L_check_1);
6525 
6526   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
6527   rorxq(yz_idx1, yz_idx1, 32);
6528   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
6529   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6530   rorxq(yz_idx2, yz_idx2, 32);
6531 
6532   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
6533 
6534   movl(Address(z, idx, Address::times_4,  4), tmp3);
6535   shrq(tmp3, 32);
6536   movl(Address(z, idx, Address::times_4,  0), tmp3);
6537   movq(carry, tmp4);
6538 
6539   bind (L_check_1);
6540   addl (idx, 0x2);
6541   andl (idx, 0x1);
6542   subl(idx, 1);
6543   jcc(Assembler::negative, L_post_third_loop_done);
6544   movl(tmp4, Address(y, idx, Address::times_4,  0));
6545   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
6546   movl(tmp4, Address(z, idx, Address::times_4,  0));
6547 
6548   add2_with_carry(carry2, tmp3, tmp4, carry);
6549 
6550   movl(Address(z, idx, Address::times_4,  0), tmp3);
6551   shrq(tmp3, 32);
6552 
6553   shlq(carry2, 32);
6554   orq(tmp3, carry2);
6555   movq(carry, tmp3);
6556 
6557   bind(L_post_third_loop_done);
6558 }
6559 
6560 /**
6561  * Code for BigInteger::multiplyToLen() intrinsic.
6562  *
6563  * rdi: x
6564  * rax: xlen
6565  * rsi: y
6566  * rcx: ylen
6567  * r8:  z
6568  * r11: zlen
6569  * r12: tmp1
6570  * r13: tmp2
6571  * r14: tmp3
6572  * r15: tmp4
6573  * rbx: tmp5
6574  *
6575  */
6576 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
6577                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
6578   ShortBranchVerifier sbv(this);
6579   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
6580 
6581   push(tmp1);
6582   push(tmp2);
6583   push(tmp3);
6584   push(tmp4);
6585   push(tmp5);
6586 
6587   push(xlen);
6588   push(zlen);
6589 
6590   const Register idx = tmp1;
6591   const Register kdx = tmp2;
6592   const Register xstart = tmp3;
6593 
6594   const Register y_idx = tmp4;
6595   const Register carry = tmp5;
6596   const Register product  = xlen;
6597   const Register x_xstart = zlen;  // reuse register
6598 
6599   // First Loop.
6600   //
6601   //  final static long LONG_MASK = 0xffffffffL;
6602   //  int xstart = xlen - 1;
6603   //  int ystart = ylen - 1;
6604   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
6606   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
6607   //    z[kdx] = (int)product;
6608   //    carry = product >>> 32;
6609   //  }
6610   //  z[xstart] = (int)carry;
6611   //
6612 
6613   movl(idx, ylen);      // idx = ylen;
6614   movl(kdx, zlen);      // kdx = xlen+ylen;
6615   xorq(carry, carry);   // carry = 0;
6616 
6617   Label L_done;
6618 
6619   movl(xstart, xlen);
6620   decrementl(xstart);
6621   jcc(Assembler::negative, L_done);
6622 
6623   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
6624 
6625   Label L_second_loop;
6626   testl(kdx, kdx);
6627   jcc(Assembler::zero, L_second_loop);
6628 
6629   Label L_carry;
6630   subl(kdx, 1);
6631   jcc(Assembler::zero, L_carry);
6632 
6633   movl(Address(z, kdx, Address::times_4,  0), carry);
6634   shrq(carry, 32);
6635   subl(kdx, 1);
6636 
6637   bind(L_carry);
6638   movl(Address(z, kdx, Address::times_4,  0), carry);
6639 
6640   // Second and third (nested) loops.
6641   //
6642   // for (int i = xstart-1; i >= 0; i--) { // Second loop
6643   //   carry = 0;
6644   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
6645   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
6646   //                    (z[k] & LONG_MASK) + carry;
6647   //     z[k] = (int)product;
6648   //     carry = product >>> 32;
6649   //   }
6650   //   z[i] = (int)carry;
6651   // }
6652   //
6653   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
6654 
6655   const Register jdx = tmp1;
6656 
6657   bind(L_second_loop);
6658   xorl(carry, carry);    // carry = 0;
6659   movl(jdx, ylen);       // j = ystart+1
6660 
6661   subl(xstart, 1);       // i = xstart-1;
6662   jcc(Assembler::negative, L_done);
6663 
6664   push (z);
6665 
6666   Label L_last_x;
6667   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
6668   subl(xstart, 1);       // i = xstart-1;
6669   jcc(Assembler::negative, L_last_x);
6670 
6671   if (UseBMI2Instructions) {
6672     movq(rdx,  Address(x, xstart, Address::times_4,  0));
6673     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
6674   } else {
6675     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
6676     rorq(x_xstart, 32);  // convert big-endian to little-endian
6677   }
6678 
6679   Label L_third_loop_prologue;
6680   bind(L_third_loop_prologue);
6681 
6682   push (x);
6683   push (xstart);
6684   push (ylen);
6685 
6686 
6687   if (UseBMI2Instructions) {
6688     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
6689   } else { // !UseBMI2Instructions
6690     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
6691   }
6692 
6693   pop(ylen);
6694   pop(xlen);
6695   pop(x);
6696   pop(z);
6697 
6698   movl(tmp3, xlen);
6699   addl(tmp3, 1);
6700   movl(Address(z, tmp3, Address::times_4,  0), carry);
6701   subl(tmp3, 1);
6702   jccb(Assembler::negative, L_done);
6703 
6704   shrq(carry, 32);
6705   movl(Address(z, tmp3, Address::times_4,  0), carry);
6706   jmp(L_second_loop);
6707 
6708   // Next infrequent code is moved outside loops.
6709   bind(L_last_x);
6710   if (UseBMI2Instructions) {
6711     movl(rdx, Address(x,  0));
6712   } else {
6713     movl(x_xstart, Address(x,  0));
6714   }
6715   jmp(L_third_loop_prologue);
6716 
6717   bind(L_done);
6718 
6719   pop(zlen);
6720   pop(xlen);
6721 
6722   pop(tmp5);
6723   pop(tmp4);
6724   pop(tmp3);
6725   pop(tmp2);
6726   pop(tmp1);
6727 }
6728 
6729 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
6730   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
6731   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
6732   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
6733   Label VECTOR8_TAIL, VECTOR4_TAIL;
6734   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
6735   Label SAME_TILL_END, DONE;
6736   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
6737 
  // scale is in rcx in both Win64 and Unix
6739   ShortBranchVerifier sbv(this);
6740 
6741   shlq(length);
6742   xorq(result, result);
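  // length is now a byte count (shifted left by the element-size shift held in cl);
  // result is the running byte offset of the comparison.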
6743 
6744   if ((AVX3Threshold == 0) && (UseAVX > 2) &&
6745       VM_Version::supports_avx512vlbw()) {
6746     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
6747 
6748     cmpq(length, 64);
6749     jcc(Assembler::less, VECTOR32_TAIL);
6750 
6751     movq(tmp1, length);
6752     andq(tmp1, 0x3F);      // tail count
6753     andq(length, ~(0x3F)); //vector count
6754 
6755     bind(VECTOR64_LOOP);
6756     // AVX512 code to compare 64 byte vectors.
6757     evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
6758     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
6759     kortestql(k7, k7);
6760     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
6761     addq(result, 64);
6762     subq(length, 64);
6763     jccb(Assembler::notZero, VECTOR64_LOOP);
6764 
6766     testq(tmp1, tmp1);
6767     jcc(Assembler::zero, SAME_TILL_END);
6768 
6769     //bind(VECTOR64_TAIL);
6770     // AVX512 code to compare up to 63 byte vectors.
6771     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
6772     shlxq(tmp2, tmp2, tmp1);
6773     notq(tmp2);
6774     kmovql(k3, tmp2);
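    // k3 now has the low 'tail count' bits set, so only the remaining tail bytes
    // participate in the masked load and compare below.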
6775 
6776     evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
6777     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
6778 
6779     ktestql(k7, k3);
6780     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
6781 
6782     bind(VECTOR64_NOT_EQUAL);
6783     kmovql(tmp1, k7);
6784     notq(tmp1);
6785     tzcntq(tmp1, tmp1);
6786     addq(result, tmp1);
6787     shrq(result);
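    // shrq by cl (the element-size shift) converts the mismatch byte offset back
    // into an element index for the return value.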
6788     jmp(DONE);
6789     bind(VECTOR32_TAIL);
6790   }
6791 
6792   cmpq(length, 8);
6793   jcc(Assembler::equal, VECTOR8_LOOP);
6794   jcc(Assembler::less, VECTOR4_TAIL);
6795 
6796   if (UseAVX >= 2) {
6797     Label VECTOR16_TAIL, VECTOR32_LOOP;
6798 
6799     cmpq(length, 16);
6800     jcc(Assembler::equal, VECTOR16_LOOP);
6801     jcc(Assembler::less, VECTOR8_LOOP);
6802 
6803     cmpq(length, 32);
6804     jccb(Assembler::less, VECTOR16_TAIL);
6805 
6806     subq(length, 32);
6807     bind(VECTOR32_LOOP);
6808     vmovdqu(rymm0, Address(obja, result));
6809     vmovdqu(rymm1, Address(objb, result));
6810     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
6811     vptest(rymm2, rymm2);
6812     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
6813     addq(result, 32);
6814     subq(length, 32);
6815     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
6816     addq(length, 32);
6817     jcc(Assembler::equal, SAME_TILL_END);
    // falling through if less than 32 bytes left; close the branch here
6819 
6820     bind(VECTOR16_TAIL);
6821     cmpq(length, 16);
6822     jccb(Assembler::less, VECTOR8_TAIL);
6823     bind(VECTOR16_LOOP);
6824     movdqu(rymm0, Address(obja, result));
6825     movdqu(rymm1, Address(objb, result));
6826     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
6827     ptest(rymm2, rymm2);
6828     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6829     addq(result, 16);
6830     subq(length, 16);
6831     jcc(Assembler::equal, SAME_TILL_END);
6832     //falling through if less than 16 bytes left
6833   } else {//regular intrinsics
6834 
6835     cmpq(length, 16);
6836     jccb(Assembler::less, VECTOR8_TAIL);
6837 
6838     subq(length, 16);
6839     bind(VECTOR16_LOOP);
6840     movdqu(rymm0, Address(obja, result));
6841     movdqu(rymm1, Address(objb, result));
6842     pxor(rymm0, rymm1);
6843     ptest(rymm0, rymm0);
6844     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6845     addq(result, 16);
6846     subq(length, 16);
6847     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
6848     addq(length, 16);
6849     jcc(Assembler::equal, SAME_TILL_END);
6850     //falling through if less than 16 bytes left
6851   }
6852 
6853   bind(VECTOR8_TAIL);
6854   cmpq(length, 8);
6855   jccb(Assembler::less, VECTOR4_TAIL);
6856   bind(VECTOR8_LOOP);
6857   movq(tmp1, Address(obja, result));
6858   movq(tmp2, Address(objb, result));
6859   xorq(tmp1, tmp2);
6860   testq(tmp1, tmp1);
6861   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
6862   addq(result, 8);
6863   subq(length, 8);
6864   jcc(Assembler::equal, SAME_TILL_END);
6865   //falling through if less than 8 bytes left
6866 
6867   bind(VECTOR4_TAIL);
6868   cmpq(length, 4);
6869   jccb(Assembler::less, BYTES_TAIL);
6870   bind(VECTOR4_LOOP);
6871   movl(tmp1, Address(obja, result));
6872   xorl(tmp1, Address(objb, result));
6873   testl(tmp1, tmp1);
6874   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
6875   addq(result, 4);
6876   subq(length, 4);
6877   jcc(Assembler::equal, SAME_TILL_END);
6878   //falling through if less than 4 bytes left
6879 
6880   bind(BYTES_TAIL);
6881   bind(BYTES_LOOP);
6882   load_unsigned_byte(tmp1, Address(obja, result));
6883   load_unsigned_byte(tmp2, Address(objb, result));
6884   xorl(tmp1, tmp2);
6885   testl(tmp1, tmp1);
6886   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6887   decq(length);
6888   jcc(Assembler::zero, SAME_TILL_END);
6889   incq(result);
6890   load_unsigned_byte(tmp1, Address(obja, result));
6891   load_unsigned_byte(tmp2, Address(objb, result));
6892   xorl(tmp1, tmp2);
6893   testl(tmp1, tmp1);
6894   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6895   decq(length);
6896   jcc(Assembler::zero, SAME_TILL_END);
6897   incq(result);
6898   load_unsigned_byte(tmp1, Address(obja, result));
6899   load_unsigned_byte(tmp2, Address(objb, result));
6900   xorl(tmp1, tmp2);
6901   testl(tmp1, tmp1);
6902   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6903   jmp(SAME_TILL_END);
6904 
6905   if (UseAVX >= 2) {
6906     bind(VECTOR32_NOT_EQUAL);
6907     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
6908     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
6909     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
6910     vpmovmskb(tmp1, rymm0);
6911     bsfq(tmp1, tmp1);
6912     addq(result, tmp1);
6913     shrq(result);
6914     jmp(DONE);
6915   }
6916 
6917   bind(VECTOR16_NOT_EQUAL);
6918   if (UseAVX >= 2) {
6919     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
6920     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
6921     pxor(rymm0, rymm2);
6922   } else {
6923     pcmpeqb(rymm2, rymm2);
6924     pxor(rymm0, rymm1);
6925     pcmpeqb(rymm0, rymm1);
6926     pxor(rymm0, rymm2);
6927   }
6928   pmovmskb(tmp1, rymm0);
6929   bsfq(tmp1, tmp1);
6930   addq(result, tmp1);
6931   shrq(result);
6932   jmpb(DONE);
6933 
6934   bind(VECTOR8_NOT_EQUAL);
6935   bind(VECTOR4_NOT_EQUAL);
6936   bsfq(tmp1, tmp1);
6937   shrq(tmp1, 3);
6938   addq(result, tmp1);
6939   bind(BYTES_NOT_EQUAL);
6940   shrq(result);
6941   jmpb(DONE);
6942 
6943   bind(SAME_TILL_END);
6944   mov64(result, -1);
6945 
6946   bind(DONE);
6947 }
6948 
6949 //Helper functions for square_to_len()
6950 
6951 /**
6952  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
6953  * Preserves x and z and modifies rest of the registers.
6954  */
6955 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6956   // Perform square and right shift by 1
6957   // Handle odd xlen case first, then for even xlen do the following
6958   // jlong carry = 0;
6959   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
6960   //     huge_128 product = x[j:j+1] * x[j:j+1];
6961   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
6962   //     z[i+2:i+3] = (jlong)(product >>> 1);
6963   //     carry = (jlong)product;
6964   // }
6965 
6966   xorq(tmp5, tmp5);     // carry
6967   xorq(rdxReg, rdxReg);
6968   xorl(tmp1, tmp1);     // index for x
6969   xorl(tmp4, tmp4);     // index for z
6970 
6971   Label L_first_loop, L_first_loop_exit;
6972 
6973   testl(xlen, 1);
6974   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
6975 
6976   // Square and right shift by 1 the odd element using 32 bit multiply
6977   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
6978   imulq(raxReg, raxReg);
6979   shrq(raxReg, 1);
6980   adcq(tmp5, 0);
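  // the bit shifted out of raxReg (CF) is saved into tmp5 as the carry for the remaining elements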
6981   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
6982   incrementl(tmp1);
6983   addl(tmp4, 2);
6984 
6985   // Square and  right shift by 1 the rest using 64 bit multiply
6986   bind(L_first_loop);
6987   cmpptr(tmp1, xlen);
6988   jccb(Assembler::equal, L_first_loop_exit);
6989 
6990   // Square
6991   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
6992   rorq(raxReg, 32);    // convert big-endian to little-endian
6993   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
6994 
6995   // Right shift by 1 and save carry
6996   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
6997   rcrq(rdxReg, 1);
6998   rcrq(raxReg, 1);
6999   adcq(tmp5, 0);
7000 
7001   // Store result in z
7002   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
7003   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
7004 
7005   // Update indices for x and z
7006   addl(tmp1, 2);
7007   addl(tmp4, 4);
7008   jmp(L_first_loop);
7009 
7010   bind(L_first_loop_exit);
7011 }
7012 
7013 
7014 /**
7015  * Perform the following multiply add operation using BMI2 instructions
7016  * carry:sum = sum + op1*op2 + carry
7017  * op2 should be in rdx
7018  * op2 is preserved, all other registers are modified
7019  */
7020 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
  // op2 must already be in rdx (mulx uses rdx as the implicit source operand)
7022   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
7023   addq(sum, carry);
7024   adcq(tmp2, 0);
7025   addq(sum, op1);
7026   adcq(tmp2, 0);
7027   movq(carry, tmp2);
7028 }
7029 
7030 /**
7031  * Perform the following multiply add operation:
7032  * carry:sum = sum + op1*op2 + carry
7033  * Preserves op1, op2 and modifies rest of registers
7034  */
7035 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
7036   // rdx:rax = op1 * op2
7037   movq(raxReg, op2);
7038   mulq(op1);
7039 
7040   //  rdx:rax = sum + carry + rdx:rax
7041   addq(sum, carry);
7042   adcq(rdxReg, 0);
7043   addq(sum, raxReg);
7044   adcq(rdxReg, 0);
7045 
7046   // carry:sum = rdx:sum
7047   movq(carry, rdxReg);
7048 }
7049 
7050 /**
7051  * Add 64 bit long carry into z[] with carry propagation.
7052  * Preserves z and carry register values and modifies rest of registers.
7053  *
7054  */
7055 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
7056   Label L_fourth_loop, L_fourth_loop_exit;
7057 
7058   movl(tmp1, 1);
7059   subl(zlen, 2);
7060   addq(Address(z, zlen, Address::times_4, 0), carry);
7061 
7062   bind(L_fourth_loop);
7063   jccb(Assembler::carryClear, L_fourth_loop_exit);
7064   subl(zlen, 2);
7065   jccb(Assembler::negative, L_fourth_loop_exit);
7066   addq(Address(z, zlen, Address::times_4, 0), tmp1);
7067   jmp(L_fourth_loop);
7068   bind(L_fourth_loop_exit);
7069 }
7070 
7071 /**
7072  * Shift z[] left by 1 bit.
7073  * Preserves x, len, z and zlen registers and modifies rest of the registers.
7074  *
7075  */
7076 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
7077 
7078   Label L_fifth_loop, L_fifth_loop_exit;
7079 
7080   // Fifth loop
7081   // Perform primitiveLeftShift(z, zlen, 1)
7082 
7083   const Register prev_carry = tmp1;
7084   const Register new_carry = tmp4;
7085   const Register value = tmp2;
7086   const Register zidx = tmp3;
7087 
7088   // int zidx, carry;
7089   // long value;
7090   // carry = 0;
7091   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
7092   //    (carry:value)  = (z[i] << 1) | carry ;
7093   //    z[i] = value;
7094   // }
7095 
7096   movl(zidx, zlen);
7097   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
7098 
7099   bind(L_fifth_loop);
7100   decl(zidx);  // Use decl to preserve carry flag
7101   decl(zidx);
7102   jccb(Assembler::negative, L_fifth_loop_exit);
7103 
7104   if (UseBMI2Instructions) {
7105      movq(value, Address(z, zidx, Address::times_4, 0));
7106      rclq(value, 1);
7107      rorxq(value, value, 32);
7108      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
7109   }
7110   else {
7111     // clear new_carry
7112     xorl(new_carry, new_carry);
7113 
7114     // Shift z[i] by 1, or in previous carry and save new carry
7115     movq(value, Address(z, zidx, Address::times_4, 0));
7116     shlq(value, 1);
7117     adcl(new_carry, 0);
7118 
7119     orq(value, prev_carry);
7120     rorq(value, 0x20);
7121     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
7122 
7123     // Set previous carry = new carry
7124     movl(prev_carry, new_carry);
7125   }
7126   jmp(L_fifth_loop);
7127 
7128   bind(L_fifth_loop_exit);
7129 }
7130 
7131 
7132 /**
7133  * Code for BigInteger::squareToLen() intrinsic
7134  *
7135  * rdi: x
7136  * rsi: len
7137  * r8:  z
7138  * rcx: zlen
7139  * r12: tmp1
7140  * r13: tmp2
7141  * r14: tmp3
7142  * r15: tmp4
7143  * rbx: tmp5
7144  *
7145  */
7146 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7147 
7148   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
7149   push(tmp1);
7150   push(tmp2);
7151   push(tmp3);
7152   push(tmp4);
7153   push(tmp5);
7154 
7155   // First loop
7156   // Store the squares, right shifted one bit (i.e., divided by 2).
7157   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
7158 
7159   // Add in off-diagonal sums.
7160   //
7161   // Second, third (nested) and fourth loops.
7162   // zlen +=2;
7163   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
7164   //    carry = 0;
7165   //    long op2 = x[xidx:xidx+1];
7166   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
7167   //       k -= 2;
7168   //       long op1 = x[j:j+1];
7169   //       long sum = z[k:k+1];
7170   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
7171   //       z[k:k+1] = sum;
7172   //    }
7173   //    add_one_64(z, k, carry, tmp_regs);
7174   // }
7175 
7176   const Register carry = tmp5;
7177   const Register sum = tmp3;
7178   const Register op1 = tmp4;
7179   Register op2 = tmp2;
7180 
7181   push(zlen);
7182   push(len);
7183   addl(zlen,2);
7184   bind(L_second_loop);
7185   xorq(carry, carry);
7186   subl(zlen, 4);
7187   subl(len, 2);
7188   push(zlen);
7189   push(len);
7190   cmpl(len, 0);
7191   jccb(Assembler::lessEqual, L_second_loop_exit);
7192 
7193   // Multiply an array by one 64 bit long.
7194   if (UseBMI2Instructions) {
7195     op2 = rdxReg;
7196     movq(op2, Address(x, len, Address::times_4,  0));
7197     rorxq(op2, op2, 32);
7198   }
7199   else {
7200     movq(op2, Address(x, len, Address::times_4,  0));
7201     rorq(op2, 32);
7202   }
7203 
7204   bind(L_third_loop);
7205   decrementl(len);
7206   jccb(Assembler::negative, L_third_loop_exit);
7207   decrementl(len);
7208   jccb(Assembler::negative, L_last_x);
7209 
7210   movq(op1, Address(x, len, Address::times_4,  0));
7211   rorq(op1, 32);
7212 
7213   bind(L_multiply);
7214   subl(zlen, 2);
7215   movq(sum, Address(z, zlen, Address::times_4,  0));
7216 
7217   // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
7218   if (UseBMI2Instructions) {
7219     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
7220   }
7221   else {
7222     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7223   }
7224 
7225   movq(Address(z, zlen, Address::times_4, 0), sum);
7226 
7227   jmp(L_third_loop);
7228   bind(L_third_loop_exit);
7229 
7230   // Fourth loop
7231   // Add 64 bit long carry into z with carry propagation.
7232   // Uses offsetted zlen.
7233   add_one_64(z, zlen, carry, tmp1);
7234 
7235   pop(len);
7236   pop(zlen);
7237   jmp(L_second_loop);
7238 
7239   // Next infrequent code is moved outside loops.
7240   bind(L_last_x);
7241   movl(op1, Address(x, 0));
7242   jmp(L_multiply);
7243 
7244   bind(L_second_loop_exit);
7245   pop(len);
7246   pop(zlen);
7247   pop(len);
7248   pop(zlen);
7249 
7250   // Fifth loop
7251   // Shift z left 1 bit.
7252   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
7253 
7254   // z[zlen-1] |= x[len-1] & 1;
7255   movl(tmp3, Address(x, len, Address::times_4, -4));
7256   andl(tmp3, 1);
7257   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
7258 
7259   pop(tmp5);
7260   pop(tmp4);
7261   pop(tmp3);
7262   pop(tmp2);
7263   pop(tmp1);
7264 }
7265 
7266 /**
7267  * Helper function for mul_add()
7268  * Multiply the in[] by int k and add to out[] starting at offset offs using
7269  * 128 bit by 32 bit multiply and return the carry in tmp5.
 * Only the quad-int-aligned part of in[] is processed by this function.
 * k is in rdxReg when UseBMI2Instructions is enabled, otherwise in tmp2.
 * This function preserves the out, in and k registers.
 * len and offset point to the corresponding indices in "in" and "out" respectively.
7274  * tmp5 has the carry.
7275  * other registers are temporary and are modified.
7276  *
7277  */
7278 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
7279   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
7280   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7281 
7282   Label L_first_loop, L_first_loop_exit;
7283 
7284   movl(tmp1, len);
7285   shrl(tmp1, 2);
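  // tmp1 = number of 4-int (128-bit) chunks; each loop iteration below consumes one chunk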
7286 
7287   bind(L_first_loop);
7288   subl(tmp1, 1);
7289   jccb(Assembler::negative, L_first_loop_exit);
7290 
7291   subl(len, 4);
7292   subl(offset, 4);
7293 
7294   Register op2 = tmp2;
7295   const Register sum = tmp3;
7296   const Register op1 = tmp4;
7297   const Register carry = tmp5;
7298 
7299   if (UseBMI2Instructions) {
7300     op2 = rdxReg;
7301   }
7302 
7303   movq(op1, Address(in, len, Address::times_4,  8));
7304   rorq(op1, 32);
7305   movq(sum, Address(out, offset, Address::times_4,  8));
7306   rorq(sum, 32);
7307   if (UseBMI2Instructions) {
7308     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7309   }
7310   else {
7311     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7312   }
7313   // Store back in big endian from little endian
7314   rorq(sum, 0x20);
7315   movq(Address(out, offset, Address::times_4,  8), sum);
7316 
7317   movq(op1, Address(in, len, Address::times_4,  0));
7318   rorq(op1, 32);
7319   movq(sum, Address(out, offset, Address::times_4,  0));
7320   rorq(sum, 32);
7321   if (UseBMI2Instructions) {
7322     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7323   }
7324   else {
7325     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7326   }
7327   // Store back in big endian from little endian
7328   rorq(sum, 0x20);
7329   movq(Address(out, offset, Address::times_4,  0), sum);
7330 
7331   jmp(L_first_loop);
7332   bind(L_first_loop_exit);
7333 }
7334 
7335 /**
7336  * Code for BigInteger::mulAdd() intrinsic
7337  *
7338  * rdi: out
7339  * rsi: in
7340  * r11: offs (out.length - offset)
7341  * rcx: len
7342  * r8:  k
7343  * r12: tmp1
7344  * r13: tmp2
7345  * r14: tmp3
7346  * r15: tmp4
7347  * rbx: tmp5
7348  * Multiply the in[] by word k and add to out[], return the carry in rax
7349  */
7350 void MacroAssembler::mul_add(Register out, Register in, Register offs,
7351    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
7352    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7353 
7354   Label L_carry, L_last_in, L_done;
7355 
7356 // carry = 0;
7357 // for (int j=len-1; j >= 0; j--) {
7358 //    long product = (in[j] & LONG_MASK) * kLong +
7359 //                   (out[offs] & LONG_MASK) + carry;
7360 //    out[offs--] = (int)product;
7361 //    carry = product >>> 32;
7362 // }
7363 //
7364   push(tmp1);
7365   push(tmp2);
7366   push(tmp3);
7367   push(tmp4);
7368   push(tmp5);
7369 
7370   Register op2 = tmp2;
7371   const Register sum = tmp3;
7372   const Register op1 = tmp4;
7373   const Register carry =  tmp5;
7374 
7375   if (UseBMI2Instructions) {
7376     op2 = rdxReg;
7377     movl(op2, k);
7378   }
7379   else {
7380     movl(op2, k);
7381   }
7382 
7383   xorq(carry, carry);
7384 
  // First loop

  // Multiply in[] by k in a 4-way unrolled loop using 128-bit by 32-bit multiply.
  // The carry is in tmp5.
7389   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
7390 
  // Multiply the trailing in[] entry using a 64-bit by 32-bit multiply, if any
7392   decrementl(len);
7393   jccb(Assembler::negative, L_carry);
7394   decrementl(len);
7395   jccb(Assembler::negative, L_last_in);
7396 
7397   movq(op1, Address(in, len, Address::times_4,  0));
7398   rorq(op1, 32);
7399 
7400   subl(offs, 2);
7401   movq(sum, Address(out, offs, Address::times_4,  0));
7402   rorq(sum, 32);
7403 
7404   if (UseBMI2Instructions) {
7405     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7406   }
7407   else {
7408     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7409   }
7410 
7411   // Store back in big endian from little endian
7412   rorq(sum, 0x20);
7413   movq(Address(out, offs, Address::times_4,  0), sum);
7414 
7415   testl(len, len);
7416   jccb(Assembler::zero, L_carry);
7417 
7418   //Multiply the last in[] entry, if any
7419   bind(L_last_in);
7420   movl(op1, Address(in, 0));
7421   movl(sum, Address(out, offs, Address::times_4,  -4));
7422 
7423   movl(raxReg, k);
7424   mull(op1); //tmp4 * eax -> edx:eax
7425   addl(sum, carry);
7426   adcl(rdxReg, 0);
7427   addl(sum, raxReg);
7428   adcl(rdxReg, 0);
7429   movl(carry, rdxReg);
7430 
7431   movl(Address(out, offs, Address::times_4,  -4), sum);
7432 
7433   bind(L_carry);
7434   //return tmp5/carry as carry in rax
7435   movl(rax, carry);
7436 
7437   bind(L_done);
7438   pop(tmp5);
7439   pop(tmp4);
7440   pop(tmp3);
7441   pop(tmp2);
7442   pop(tmp1);
7443 }
#endif // _LP64
7445 
7446 /**
7447  * Emits code to update CRC-32 with a byte value according to constants in table
7448  *
7449  * @param [in,out]crc   Register containing the crc.
7450  * @param [in]val       Register containing the byte to fold into the CRC.
7451  * @param [in]table     Register containing the table of crc constants.
7452  *
7453  * uint32_t crc;
7454  * val = crc_table[(val ^ crc) & 0xFF];
7455  * crc = val ^ (crc >> 8);
7456  *
7457  */
7458 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7459   xorl(val, crc);
7460   andl(val, 0xFF);
7461   shrl(crc, 8); // unsigned shift
7462   xorl(crc, Address(table, val, Address::times_4, 0));
7463 }
7464 
7465 /**
7466  * Fold 128-bit data chunk
7467  */
7468 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7469   if (UseAVX > 0) {
7470     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7471     vpclmulldq(xcrc, xK, xcrc); // [63:0]
7472     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7473     pxor(xcrc, xtmp);
7474   } else {
7475     movdqa(xtmp, xcrc);
7476     pclmulhdq(xtmp, xK);   // [123:64]
7477     pclmulldq(xcrc, xK);   // [63:0]
7478     pxor(xcrc, xtmp);
7479     movdqu(xtmp, Address(buf, offset));
7480     pxor(xcrc, xtmp);
7481   }
7482 }
7483 
7484 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7485   if (UseAVX > 0) {
7486     vpclmulhdq(xtmp, xK, xcrc);
7487     vpclmulldq(xcrc, xK, xcrc);
7488     pxor(xcrc, xbuf);
7489     pxor(xcrc, xtmp);
7490   } else {
7491     movdqa(xtmp, xcrc);
7492     pclmulhdq(xtmp, xK);
7493     pclmulldq(xcrc, xK);
7494     pxor(xcrc, xbuf);
7495     pxor(xcrc, xtmp);
7496   }
7497 }
7498 
7499 /**
7500  * 8-bit folds to compute 32-bit CRC
7501  *
7502  * uint64_t xcrc;
7503  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7504  */
7505 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7506   movdl(tmp, xcrc);
7507   andl(tmp, 0xFF);
7508   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7509   psrldq(xcrc, 1); // unsigned shift one byte
7510   pxor(xcrc, xtmp);
7511 }
7512 
7513 /**
7514  * uint32_t crc;
7515  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7516  */
7517 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7518   movl(tmp, crc);
7519   andl(tmp, 0xFF);
7520   shrl(crc, 8);
7521   xorl(crc, Address(table, tmp, Address::times_4, 0));
7522 }
7523 
7524 /**
7525  * @param crc   register containing existing CRC (32-bit)
7526  * @param buf   register pointing to input byte buffer (byte*)
7527  * @param len   register containing number of bytes
7528  * @param table register that will contain address of CRC table
7529  * @param tmp   scratch register
7530  */
7531 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7532   assert_different_registers(crc, buf, len, table, tmp, rax);
7533 
7534   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7535   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7536 
7537   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7538   // context for the registers used, where all instructions below are using 128-bit mode
7539   // On EVEX without VL and BW, these instructions will all be AVX.
7540   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7541   notl(crc); // ~crc
7542   cmpl(len, 16);
7543   jcc(Assembler::less, L_tail);
7544 
7545   // Align buffer to 16 bytes
7546   movl(tmp, buf);
7547   andl(tmp, 0xF);
7548   jccb(Assembler::zero, L_aligned);
7549   subl(tmp,  16);
7550   addl(len, tmp);
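  // tmp = (buf & 0xF) - 16, i.e. minus the number of bytes needed to reach 16-byte
  // alignment; the loop below consumes them one at a time while counting tmp up to zero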
7551 
7552   align(4);
7553   BIND(L_align_loop);
7554   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7555   update_byte_crc32(crc, rax, table);
7556   increment(buf);
7557   incrementl(tmp);
7558   jccb(Assembler::less, L_align_loop);
7559 
7560   BIND(L_aligned);
7561   movl(tmp, len); // save
7562   shrl(len, 4);
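  // len = number of 16-byte chunks; the original byte count is kept in tmp for the tail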
7563   jcc(Assembler::zero, L_tail_restore);
7564 
7565   // Fold crc into first bytes of vector
7566   movdqa(xmm1, Address(buf, 0));
7567   movdl(rax, xmm1);
7568   xorl(crc, rax);
7569   if (VM_Version::supports_sse4_1()) {
7570     pinsrd(xmm1, crc, 0);
7571   } else {
7572     pinsrw(xmm1, crc, 0);
7573     shrl(crc, 16);
7574     pinsrw(xmm1, crc, 1);
7575   }
7576   addptr(buf, 16);
7577   subl(len, 4); // len > 0
7578   jcc(Assembler::less, L_fold_tail);
7579 
7580   movdqa(xmm2, Address(buf,  0));
7581   movdqa(xmm3, Address(buf, 16));
7582   movdqa(xmm4, Address(buf, 32));
7583   addptr(buf, 48);
7584   subl(len, 3);
7585   jcc(Assembler::lessEqual, L_fold_512b);
7586 
7587   // Fold total 512 bits of polynomial on each iteration,
7588   // 128 bits per each of 4 parallel streams.
7589   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32), rscratch1);
7590 
7591   align32();
7592   BIND(L_fold_512b_loop);
7593   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7594   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
7595   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
7596   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
7597   addptr(buf, 64);
7598   subl(len, 4);
7599   jcc(Assembler::greater, L_fold_512b_loop);
7600 
7601   // Fold 512 bits to 128 bits.
7602   BIND(L_fold_512b);
7603   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
7604   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
7605   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7606   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7607 
7608   // Fold the rest of 128 bits data chunks
7609   BIND(L_fold_tail);
7610   addl(len, 3);
7611   jccb(Assembler::lessEqual, L_fold_128b);
7612   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
7613 
7614   BIND(L_fold_tail_loop);
7615   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7616   addptr(buf, 16);
7617   decrementl(len);
7618   jccb(Assembler::greater, L_fold_tail_loop);
7619 
7620   // Fold 128 bits in xmm1 down into 32 bits in crc register.
7621   BIND(L_fold_128b);
7622   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()), rscratch1);
7623   if (UseAVX > 0) {
7624     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7625     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7626     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7627   } else {
7628     movdqa(xmm2, xmm0);
7629     pclmulqdq(xmm2, xmm1, 0x1);
7630     movdqa(xmm3, xmm0);
7631     pand(xmm3, xmm2);
7632     pclmulqdq(xmm0, xmm3, 0x1);
7633   }
7634   psrldq(xmm1, 8);
7635   psrldq(xmm2, 4);
7636   pxor(xmm0, xmm1);
7637   pxor(xmm0, xmm2);
7638 
7639   // 8 8-bit folds to compute 32-bit CRC.
7640   for (int j = 0; j < 4; j++) {
7641     fold_8bit_crc32(xmm0, table, xmm1, rax);
7642   }
7643   movdl(crc, xmm0); // mov 32 bits to general register
7644   for (int j = 0; j < 4; j++) {
7645     fold_8bit_crc32(crc, table, rax);
7646   }
7647 
7648   BIND(L_tail_restore);
7649   movl(len, tmp); // restore
7650   BIND(L_tail);
7651   andl(len, 0xf);
7652   jccb(Assembler::zero, L_exit);
7653 
7654   // Fold the remaining bytes
7655   align(4);
7656   BIND(L_tail_loop);
7657   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7658   update_byte_crc32(crc, rax, table);
7659   increment(buf);
7660   decrementl(len);
7661   jccb(Assembler::greater, L_tail_loop);
7662 
7663   BIND(L_exit);
7664   notl(crc); // ~crc
7665 }
7666 
7667 #ifdef _LP64
7668 // Helper function for AVX 512 CRC32
7669 // Fold 512-bit data chunks
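//
// One fold step combines the running CRC state with the next 512-bit chunk of input,
// per 128-bit lane:
//   xcrc' = clmul(xcrc.lo64, xK.hi64) ^ clmul(xcrc.hi64, xK.lo64) ^ data
// where clmul is a carry-less (GF(2)) multiply and xK holds the precomputed folding
// constants. A rough per-lane sketch of that recurrence (illustrative only; clmul64
// stands for a 64x64 -> 128-bit carry-less multiply and is not a symbol in this file):
//
//   state = clmul64(state_lo, K_hi) ^ clmul64(state_hi, K_lo) ^ next_16_bytes;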
7670 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
7671                                              Register pos, int offset) {
7672   evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
7673   evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [127:64]
7674   evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
7675   evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
7676   evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
7677 }
7678 
7679 // Helper function for AVX 512 CRC32
7680 // Compute CRC32 for < 256B buffers
7681 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
7682                                               Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7683                                               Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7684 
7685   Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
7686   Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
7687   Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
7688 
7689   // check if there is enough buffer to be able to fold 16B at a time
7690   cmpl(len, 32);
7691   jcc(Assembler::less, L_less_than_32);
7692 
7693   // if there is, load the constants
7694   movdqu(xmm10, Address(table, 1 * 16));    // rk1 and rk2 in xmm10
7695   movdl(xmm0, crc);                        // get the initial crc value
7696   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); // load the plaintext
7697   pxor(xmm7, xmm0);
7698 
7699   // update the buffer pointer
7700   addl(pos, 16);
7701   // update the counter; subtract 32 instead of 16 to save one instruction in the loop
7702   subl(len, 32);
7703   jmp(L_16B_reduction_loop);
7704 
7705   bind(L_less_than_32);
7706   // move the initial crc to the return value; this is necessary for zero-length buffers.
7707   movl(rax, crc);
7708   testl(len, len);
7709   jcc(Assembler::equal, L_cleanup);
7710 
7711   movdl(xmm0, crc);                        //get the initial crc value
7712 
7713   cmpl(len, 16);
7714   jcc(Assembler::equal, L_exact_16_left);
7715   jcc(Assembler::less, L_less_than_16_left);
7716 
7717   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7718   pxor(xmm7, xmm0);                       //xor the initial crc value
7719   addl(pos, 16);
7720   subl(len, 16);
7721   movdqu(xmm10, Address(table, 1 * 16));    // rk1 and rk2 in xmm10
7722   jmp(L_get_last_two_xmms);
7723 
7724   bind(L_less_than_16_left);
7725   // use stack space to load data of less than 16 bytes; zero out the 16B in memory first.
7726   pxor(xmm1, xmm1);
7727   movptr(tmp1, rsp);
7728   movdqu(Address(tmp1, 0 * 16), xmm1);
7729 
7730   cmpl(len, 4);
7731   jcc(Assembler::less, L_only_less_than_4);
7732 
7733   // back up the counter value
7734   movl(tmp2, len);
7735   cmpl(len, 8);
7736   jcc(Assembler::less, L_less_than_8_left);
7737 
7738   //load 8 Bytes
7739   movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
7740   movq(Address(tmp1, 0 * 16), rax);
7741   addptr(tmp1, 8);
7742   subl(len, 8);
7743   addl(pos, 8);
7744 
7745   bind(L_less_than_8_left);
7746   cmpl(len, 4);
7747   jcc(Assembler::less, L_less_than_4_left);
7748 
7749   //load 4 Bytes
7750   movl(rax, Address(buf, pos, Address::times_1, 0));
7751   movl(Address(tmp1, 0 * 16), rax);
7752   addptr(tmp1, 4);
7753   subl(len, 4);
7754   addl(pos, 4);
7755 
7756   bind(L_less_than_4_left);
7757   cmpl(len, 2);
7758   jcc(Assembler::less, L_less_than_2_left);
7759 
7760   // load 2 Bytes
7761   movw(rax, Address(buf, pos, Address::times_1, 0));
7762   movl(Address(tmp1, 0 * 16), rax);
7763   addptr(tmp1, 2);
7764   subl(len, 2);
7765   addl(pos, 2);
7766 
7767   bind(L_less_than_2_left);
7768   cmpl(len, 1);
7769   jcc(Assembler::less, L_zero_left);
7770 
7771   // load 1 Byte
7772   movb(rax, Address(buf, pos, Address::times_1, 0));
7773   movb(Address(tmp1, 0 * 16), rax);
7774 
7775   bind(L_zero_left);
7776   movdqu(xmm7, Address(rsp, 0));
7777   pxor(xmm7, xmm0);                       //xor the initial crc value
7778 
7779   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7780   movdqu(xmm0, Address(rax, tmp2));
7781   pshufb(xmm7, xmm0);
7782   jmp(L_128_done);
7783 
7784   bind(L_exact_16_left);
7785   movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
7786   pxor(xmm7, xmm0);                       //xor the initial crc value
7787   jmp(L_128_done);
7788 
7789   bind(L_only_less_than_4);
7790   cmpl(len, 3);
7791   jcc(Assembler::less, L_only_less_than_3);
7792 
7793   // load 3 Bytes
7794   movb(rax, Address(buf, pos, Address::times_1, 0));
7795   movb(Address(tmp1, 0), rax);
7796 
7797   movb(rax, Address(buf, pos, Address::times_1, 1));
7798   movb(Address(tmp1, 1), rax);
7799 
7800   movb(rax, Address(buf, pos, Address::times_1, 2));
7801   movb(Address(tmp1, 2), rax);
7802 
7803   movdqu(xmm7, Address(rsp, 0));
7804   pxor(xmm7, xmm0);                     //xor the initial crc value
7805 
7806   pslldq(xmm7, 0x5);
7807   jmp(L_barrett);
7808   bind(L_only_less_than_3);
7809   cmpl(len, 2);
7810   jcc(Assembler::less, L_only_less_than_2);
7811 
7812   // load 2 Bytes
7813   movb(rax, Address(buf, pos, Address::times_1, 0));
7814   movb(Address(tmp1, 0), rax);
7815 
7816   movb(rax, Address(buf, pos, Address::times_1, 1));
7817   movb(Address(tmp1, 1), rax);
7818 
7819   movdqu(xmm7, Address(rsp, 0));
7820   pxor(xmm7, xmm0);                     //xor the initial crc value
7821 
7822   pslldq(xmm7, 0x6);
7823   jmp(L_barrett);
7824 
7825   bind(L_only_less_than_2);
7826   //load 1 Byte
7827   movb(rax, Address(buf, pos, Address::times_1, 0));
7828   movb(Address(tmp1, 0), rax);
7829 
7830   movdqu(xmm7, Address(rsp, 0));
7831   pxor(xmm7, xmm0);                     //xor the initial crc value
7832 
7833   pslldq(xmm7, 0x7);
7834 }
7835 
7836 /**
7837 * Compute CRC32 using AVX512 instructions
7838 * param crc   register containing existing CRC (32-bit)
7839 * param buf   register pointing to input byte buffer (byte*)
7840 * param len   register containing number of bytes
7841 * param table address of crc or crc32c table
7842 * param tmp1  scratch register
7843 * param tmp2  scratch register
7844 * return rax  result register
7845 *
7846 * This routine is identical for crc32c except for the precomputed constant
7847 * table, which is passed as the table argument. The calculation steps are
7848 * the same for both variants.
7849 */
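//
// Rough flow of the routine below (descriptive summary of the labels used):
//   1. While >= 256B remain, fold four 64B chunks per iteration (L_fold_256_B_loop).
//   2. While >= 128B remain, fold two 64B chunks per iteration (L_fold_128_B_loop).
//   3. Reduce the folded state 16B at a time (L_16B_reduction_loop) down to a single
//      128-bit value (L_128_done), which is then reduced via 64-bit and 32-bit folds.
//   4. Barrett reduction (L_barrett) produces the final 32-bit CRC.
// Buffers shorter than 256B are handled by kernel_crc32_avx512_256B, which jumps back
// into the shared reduction labels above.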
7850 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
7851   assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
7852 
7853   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7854   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7855   Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
7856   Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
7857   Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
7858 
7859   const Register pos = r12;
7860   push(r12);
7861   subptr(rsp, 16 * 2 + 8);
7862 
7863   // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
7864   // context for the registers used, where all instructions below use 128-bit mode.
7865   // On EVEX without VL and BW, these instructions will all be AVX.
7866   movl(pos, 0);
7867 
7868   // check if smaller than 256B
7869   cmpl(len, 256);
7870   jcc(Assembler::less, L_less_than_256);
7871 
7872   // load the initial crc value
7873   movdl(xmm10, crc);
7874 
7875   // receive the initial 128B of data (two 64B loads), xor the initial crc value into the first
7876   evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
7877   evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
7878   evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
7879   evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
7880 
7881   subl(len, 256);
7882   cmpl(len, 256);
7883   jcc(Assembler::less, L_fold_128_B_loop);
7884 
7885   evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
7886   evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
7887   evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
7888   subl(len, 256);
7889 
7890   bind(L_fold_256_B_loop);
7891   addl(pos, 256);
7892   fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
7893   fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
7894   fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
7895   fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
7896 
7897   subl(len, 256);
7898   jcc(Assembler::greaterEqual, L_fold_256_B_loop);
7899 
7900   // Fold 256 into 128
7901   addl(pos, 256);
7902   evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
7903   evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
7904   vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
7905 
7906   evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
7907   evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
7908   vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
7909 
7910   evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
7911   evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
7912 
7913   addl(len, 128);
7914   jmp(L_fold_128_B_register);
7915 
7916   // At this point in the code there are 128 * x + y (0 <= y < 128) bytes of buffer. The fold_128_B_loop
7917   // will fold 128B at a time until we have 128 + y bytes of buffer.
7918 
7919   // Fold 128B at a time. This section of the code folds two zmm registers in parallel
7920   bind(L_fold_128_B_loop);
7921   addl(pos, 128);
7922   fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
7923   fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
7924 
7925   subl(len, 128);
7926   jcc(Assembler::greaterEqual, L_fold_128_B_loop);
7927 
7928   addl(pos, 128);
7929 
7930   // at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
7931   // the 128B of folded data is held in two zmm registers: zmm0 and zmm4
7932   bind(L_fold_128_B_register);
7933   evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
7934   evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7935   evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
7936   evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
7937   // save the last 16B lane, which has no multiplicand
7938   vextracti64x2(xmm7, xmm4, 3);
7939 
7940   evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
7941   evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
7942   // Needed later in reduction loop
7943   movdqu(xmm10, Address(table, 1 * 16));
7944   vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
7945   vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
7946 
7947   // Swap 1,0,3,2 - 01 00 11 10
7948   evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
7949   evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
7950   vextracti128(xmm5, xmm8, 1);
7951   evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
7952 
7953   // instead of 128, we add 128 - 16 to the loop counter to save one instruction in the loop
7954   // instead of a cmp instruction, we use the sign flag with the jl instruction
7955   addl(len, 128 - 16);
7956   jcc(Assembler::less, L_final_reduction_for_128);
7957 
7958   bind(L_16B_reduction_loop);
7959   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7960   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7961   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7962   movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
7963   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7964   addl(pos, 16);
7965   subl(len, 16);
7966   jcc(Assembler::greaterEqual, L_16B_reduction_loop);
7967 
7968   bind(L_final_reduction_for_128);
7969   addl(len, 16);
7970   jcc(Assembler::equal, L_128_done);
7971 
7972   bind(L_get_last_two_xmms);
7973   movdqu(xmm2, xmm7);
7974   addl(pos, len);
7975   movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
7976   subl(pos, len);
7977 
7978   // get rid of the extra data that was loaded before
7979   // load the shift constant
7980   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7981   movdqu(xmm0, Address(rax, len));
7982   addl(rax, len);
7983 
7984   vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7985   // Change mask to 512
7986   vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
7987   vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
7988 
7989   blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
7990   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7991   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7992   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7993   vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
7994 
7995   bind(L_128_done);
7996   // compute crc of a 128-bit value
7997   movdqu(xmm10, Address(table, 3 * 16));
7998   movdqu(xmm0, xmm7);
7999 
8000   // 64b fold
8001   vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
8002   vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
8003   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8004 
8005   // 32b fold
8006   movdqu(xmm0, xmm7);
8007   vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
8008   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8009   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8010   jmp(L_barrett);
8011 
8012   bind(L_less_than_256);
8013   kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
8014 
8015   // Barrett reduction
8016   bind(L_barrett);
8017   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
8018   movdqu(xmm1, xmm7);
8019   movdqu(xmm2, xmm7);
8020   movdqu(xmm10, Address(table, 4 * 16));
8021 
8022   pclmulqdq(xmm7, xmm10, 0x0);
8023   pxor(xmm7, xmm2);
8024   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
8025   movdqu(xmm2, xmm7);
8026   pclmulqdq(xmm7, xmm10, 0x10);
8027   pxor(xmm7, xmm2);
8028   pxor(xmm7, xmm1);
8029   pextrd(crc, xmm7, 2);
8030 
8031   bind(L_cleanup);
8032   addptr(rsp, 16 * 2 + 8);
8033   pop(r12);
8034 }
8035 
8036 // S. Gueron / Information Processing Letters 112 (2012) 184
8037 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
8038 // Input: A 32-bit value B = [byte3, byte2, byte1, byte0].
8039 // Output: the 64-bit carry-less product of B * CONST
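//
// Illustrative scalar equivalent of the lookups emitted below (a sketch only; TABLEExt
// stands for the per-chunk table of 256 64-bit entries reached through
// StubRoutines::crc32c_table_addr() and is not a symbol in this file):
//
//   uint64_t Q1 = TABLEExt[n][ B        & 0xFF];
//   uint64_t Q2 = TABLEExt[n][(B >>  8) & 0xFF];
//   uint64_t Q3 = TABLEExt[n][(B >> 16) & 0xFF];
//   uint64_t Q4 = TABLEExt[n][(B >> 24) & 0xFF];
//   return Q1 ^ (Q2 << 8) ^ (Q3 << 16) ^ (Q4 << 24);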
8040 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
8041                                      Register tmp1, Register tmp2, Register tmp3) {
8042   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
8043   if (n > 0) {
8044     addq(tmp3, n * 256 * 8);
8045   }
8046   //    Q1 = TABLEExt[n][B & 0xFF];
8047   movl(tmp1, in);
8048   andl(tmp1, 0x000000FF);
8049   shll(tmp1, 3);
8050   addq(tmp1, tmp3);
8051   movq(tmp1, Address(tmp1, 0));
8052 
8053   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
8054   movl(tmp2, in);
8055   shrl(tmp2, 8);
8056   andl(tmp2, 0x000000FF);
8057   shll(tmp2, 3);
8058   addq(tmp2, tmp3);
8059   movq(tmp2, Address(tmp2, 0));
8060 
8061   shlq(tmp2, 8);
8062   xorq(tmp1, tmp2);
8063 
8064   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
8065   movl(tmp2, in);
8066   shrl(tmp2, 16);
8067   andl(tmp2, 0x000000FF);
8068   shll(tmp2, 3);
8069   addq(tmp2, tmp3);
8070   movq(tmp2, Address(tmp2, 0));
8071 
8072   shlq(tmp2, 16);
8073   xorq(tmp1, tmp2);
8074 
8075   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
8076   shrl(in, 24);
8077   andl(in, 0x000000FF);
8078   shll(in, 3);
8079   addq(in, tmp3);
8080   movq(in, Address(in, 0));
8081 
8082   shlq(in, 24);
8083   xorq(in, tmp1);
8084   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
8085 }
8086 
8087 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
8088                                       Register in_out,
8089                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
8090                                       XMMRegister w_xtmp2,
8091                                       Register tmp1,
8092                                       Register n_tmp2, Register n_tmp3) {
8093   if (is_pclmulqdq_supported) {
8094     movdl(w_xtmp1, in_out); // modified blindly
8095 
8096     movl(tmp1, const_or_pre_comp_const_index);
8097     movdl(w_xtmp2, tmp1);
8098     pclmulqdq(w_xtmp1, w_xtmp2, 0);
8099 
8100     movdq(in_out, w_xtmp1);
8101   } else {
8102     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
8103   }
8104 }
8105 
8106 // Recombination Alternative 2: No bit-reflections
8107 // T1 = (CRC_A * U1) << 1
8108 // T2 = (CRC_B * U2) << 1
8109 // C1 = T1 >> 32
8110 // C2 = T2 >> 32
8111 // T1 = T1 & 0xFFFFFFFF
8112 // T2 = T2 & 0xFFFFFFFF
8113 // T1 = CRC32(0, T1)
8114 // T2 = CRC32(0, T2)
8115 // C1 = C1 ^ T1
8116 // C2 = C2 ^ T2
8117 // CRC = C1 ^ C2 ^ CRC_C
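//
// C-level sketch of the recombination above (illustrative only; clmul32 stands for a
// 32x32 -> 64-bit carry-less multiply, crc32_u32 for the hardware CRC32 instruction on
// a 32-bit operand, and U1/U2 for the constants selected by the index arguments):
//
//   uint64_t T1 = clmul32(CRC_A, U1) << 1;
//   uint64_t T2 = clmul32(CRC_B, U2) << 1;
//   uint32_t C1 = (uint32_t)(T1 >> 32) ^ crc32_u32(0, (uint32_t)T1);
//   uint32_t C2 = (uint32_t)(T2 >> 32) ^ crc32_u32(0, (uint32_t)T2);
//   uint32_t CRC = C1 ^ C2 ^ CRC_C;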
8118 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
8119                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8120                                      Register tmp1, Register tmp2,
8121                                      Register n_tmp3) {
8122   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8123   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8124   shlq(in_out, 1);
8125   movl(tmp1, in_out);
8126   shrq(in_out, 32);
8127   xorl(tmp2, tmp2);
8128   crc32(tmp2, tmp1, 4);
8129   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
8130   shlq(in1, 1);
8131   movl(tmp1, in1);
8132   shrq(in1, 32);
8133   xorl(tmp2, tmp2);
8134   crc32(tmp2, tmp1, 4);
8135   xorl(in1, tmp2);
8136   xorl(in_out, in1);
8137   xorl(in_out, in2);
8138 }
8139 
8140 // Set N to a predefined value
8141 // Subtract it from the length of the buffer and
8142 // execute in a loop:
8143 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
8144 // for i = 1 to N do
8145 //  CRC_A = CRC32(CRC_A, A[i])
8146 //  CRC_B = CRC32(CRC_B, B[i])
8147 //  CRC_C = CRC32(CRC_C, C[i])
8148 // end for
8149 // Recombine
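//
// Sketch of the chunked loop implemented below (illustrative only; buf walks the three
// size-byte partitions A, B and C in lockstep, crc32_u64 stands for the hardware CRC32
// instruction on a quadword, and recombine for crc32c_rec_alt2 above):
//
//   while (len >= 3 * size) {
//     CRC_B = 0; CRC_C = 0;                      // CRC_A carries the running value
//     for (p = buf; p < buf + size; p += 8) {
//       CRC_A = crc32_u64(CRC_A, load64(p));
//       CRC_B = crc32_u64(CRC_B, load64(p + size));
//       CRC_C = crc32_u64(CRC_C, load64(p + 2 * size));
//     }
//     CRC_A = recombine(CRC_A, CRC_B, CRC_C);
//     buf += 3 * size; len -= 3 * size;
//   }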
8150 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
8151                                        Register in_out1, Register in_out2, Register in_out3,
8152                                        Register tmp1, Register tmp2, Register tmp3,
8153                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8154                                        Register tmp4, Register tmp5,
8155                                        Register n_tmp6) {
8156   Label L_processPartitions;
8157   Label L_processPartition;
8158   Label L_exit;
8159 
8160   bind(L_processPartitions);
8161   cmpl(in_out1, 3 * size);
8162   jcc(Assembler::less, L_exit);
8163     xorl(tmp1, tmp1);
8164     xorl(tmp2, tmp2);
8165     movq(tmp3, in_out2);
8166     addq(tmp3, size);
8167 
8168     bind(L_processPartition);
8169       crc32(in_out3, Address(in_out2, 0), 8);
8170       crc32(tmp1, Address(in_out2, size), 8);
8171       crc32(tmp2, Address(in_out2, size * 2), 8);
8172       addq(in_out2, 8);
8173       cmpq(in_out2, tmp3);
8174       jcc(Assembler::less, L_processPartition);
8175     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
8176             w_xtmp1, w_xtmp2, w_xtmp3,
8177             tmp4, tmp5,
8178             n_tmp6);
8179     addq(in_out2, 2 * size);
8180     subl(in_out1, 3 * size);
8181     jmp(L_processPartitions);
8182 
8183   bind(L_exit);
8184 }
8185 #else
8186 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
8187                                      Register tmp1, Register tmp2, Register tmp3,
8188                                      XMMRegister xtmp1, XMMRegister xtmp2) {
8189   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
8190   if (n > 0) {
8191     addl(tmp3, n * 256 * 8);
8192   }
8193   //    Q1 = TABLEExt[n][B & 0xFF];
8194   movl(tmp1, in_out);
8195   andl(tmp1, 0x000000FF);
8196   shll(tmp1, 3);
8197   addl(tmp1, tmp3);
8198   movq(xtmp1, Address(tmp1, 0));
8199 
8200   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
8201   movl(tmp2, in_out);
8202   shrl(tmp2, 8);
8203   andl(tmp2, 0x000000FF);
8204   shll(tmp2, 3);
8205   addl(tmp2, tmp3);
8206   movq(xtmp2, Address(tmp2, 0));
8207 
8208   psllq(xtmp2, 8);
8209   pxor(xtmp1, xtmp2);
8210 
8211   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
8212   movl(tmp2, in_out);
8213   shrl(tmp2, 16);
8214   andl(tmp2, 0x000000FF);
8215   shll(tmp2, 3);
8216   addl(tmp2, tmp3);
8217   movq(xtmp2, Address(tmp2, 0));
8218 
8219   psllq(xtmp2, 16);
8220   pxor(xtmp1, xtmp2);
8221 
8222   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
8223   shrl(in_out, 24);
8224   andl(in_out, 0x000000FF);
8225   shll(in_out, 3);
8226   addl(in_out, tmp3);
8227   movq(xtmp2, Address(in_out, 0));
8228 
8229   psllq(xtmp2, 24);
8230   pxor(xtmp1, xtmp2); // Result in CXMM
8231   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
8232 }
8233 
8234 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
8235                                       Register in_out,
8236                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
8237                                       XMMRegister w_xtmp2,
8238                                       Register tmp1,
8239                                       Register n_tmp2, Register n_tmp3) {
8240   if (is_pclmulqdq_supported) {
8241     movdl(w_xtmp1, in_out);
8242 
8243     movl(tmp1, const_or_pre_comp_const_index);
8244     movdl(w_xtmp2, tmp1);
8245     pclmulqdq(w_xtmp1, w_xtmp2, 0);
8246     // Keep the result in XMM since the GPR is only 32 bits wide
8247   } else {
8248     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
8249   }
8250 }
8251 
8252 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
8253                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8254                                      Register tmp1, Register tmp2,
8255                                      Register n_tmp3) {
8256   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8257   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8258 
8259   psllq(w_xtmp1, 1);
8260   movdl(tmp1, w_xtmp1);
8261   psrlq(w_xtmp1, 32);
8262   movdl(in_out, w_xtmp1);
8263 
8264   xorl(tmp2, tmp2);
8265   crc32(tmp2, tmp1, 4);
8266   xorl(in_out, tmp2);
8267 
8268   psllq(w_xtmp2, 1);
8269   movdl(tmp1, w_xtmp2);
8270   psrlq(w_xtmp2, 32);
8271   movdl(in1, w_xtmp2);
8272 
8273   xorl(tmp2, tmp2);
8274   crc32(tmp2, tmp1, 4);
8275   xorl(in1, tmp2);
8276   xorl(in_out, in1);
8277   xorl(in_out, in2);
8278 }
8279 
8280 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
8281                                        Register in_out1, Register in_out2, Register in_out3,
8282                                        Register tmp1, Register tmp2, Register tmp3,
8283                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8284                                        Register tmp4, Register tmp5,
8285                                        Register n_tmp6) {
8286   Label L_processPartitions;
8287   Label L_processPartition;
8288   Label L_exit;
8289 
8290   bind(L_processPartitions);
8291   cmpl(in_out1, 3 * size);
8292   jcc(Assembler::less, L_exit);
8293     xorl(tmp1, tmp1);
8294     xorl(tmp2, tmp2);
8295     movl(tmp3, in_out2);
8296     addl(tmp3, size);
8297 
8298     bind(L_processPartition);
8299       crc32(in_out3, Address(in_out2, 0), 4);
8300       crc32(tmp1, Address(in_out2, size), 4);
8301       crc32(tmp2, Address(in_out2, size*2), 4);
8302       crc32(in_out3, Address(in_out2, 0+4), 4);
8303       crc32(tmp1, Address(in_out2, size+4), 4);
8304       crc32(tmp2, Address(in_out2, size*2+4), 4);
8305       addl(in_out2, 8);
8306       cmpl(in_out2, tmp3);
8307       jcc(Assembler::less, L_processPartition);
8308 
8309         push(tmp3);
8310         push(in_out1);
8311         push(in_out2);
8312         tmp4 = tmp3;
8313         tmp5 = in_out1;
8314         n_tmp6 = in_out2;
8315 
8316       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
8317             w_xtmp1, w_xtmp2, w_xtmp3,
8318             tmp4, tmp5,
8319             n_tmp6);
8320 
8321         pop(in_out2);
8322         pop(in_out1);
8323         pop(tmp3);
8324 
8325     addl(in_out2, 2 * size);
8326     subl(in_out1, 3 * size);
8327     jmp(L_processPartitions);
8328 
8329   bind(L_exit);
8330 }
8331 #endif //LP64
8332 
8333 #ifdef _LP64
8334 // Algorithm 2: Pipelined usage of the CRC32 instruction.
8335 // Input: A buffer I of L bytes.
8336 // Output: the CRC32C value of the buffer.
8337 // Notations:
8338 // Write L = 24N + r, with N = floor (L/24).
8339 // r = L mod 24 (0 <= r < 24).
8340 // Consider I as the concatenation of A|B|C|R, where A, B, and C each consist of
8341 // N quadwords, and R consists of r bytes.
8342 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
8343 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
8344 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
8345 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
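//
// Worked example of the notation above (illustrative): for L = 100, N = floor(100/24) = 4
// and r = 100 - 24*4 = 4, so A, B and C each consist of 4 quadwords (32 bytes) and R is
// the trailing 4 bytes.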
8346 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
8347                                           Register tmp1, Register tmp2, Register tmp3,
8348                                           Register tmp4, Register tmp5, Register tmp6,
8349                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8350                                           bool is_pclmulqdq_supported) {
8351   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
8352   Label L_wordByWord;
8353   Label L_byteByByteProlog;
8354   Label L_byteByByte;
8355   Label L_exit;
8356 
8357   if (is_pclmulqdq_supported ) {
8358     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
8359     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
8360 
8361     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
8362     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
8363 
8364     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8365     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8366     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
8367   } else {
8368     const_or_pre_comp_const_index[0] = 1;
8369     const_or_pre_comp_const_index[1] = 0;
8370 
8371     const_or_pre_comp_const_index[2] = 3;
8372     const_or_pre_comp_const_index[3] = 2;
8373 
8374     const_or_pre_comp_const_index[4] = 5;
8375     const_or_pre_comp_const_index[5] = 4;
8376    }
8377   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8378                     in2, in1, in_out,
8379                     tmp1, tmp2, tmp3,
8380                     w_xtmp1, w_xtmp2, w_xtmp3,
8381                     tmp4, tmp5,
8382                     tmp6);
8383   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8384                     in2, in1, in_out,
8385                     tmp1, tmp2, tmp3,
8386                     w_xtmp1, w_xtmp2, w_xtmp3,
8387                     tmp4, tmp5,
8388                     tmp6);
8389   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8390                     in2, in1, in_out,
8391                     tmp1, tmp2, tmp3,
8392                     w_xtmp1, w_xtmp2, w_xtmp3,
8393                     tmp4, tmp5,
8394                     tmp6);
8395   movl(tmp1, in2);
8396   andl(tmp1, 0x00000007);
8397   negl(tmp1);
8398   addl(tmp1, in2);
8399   addq(tmp1, in1);
8400 
8401   cmpq(in1, tmp1);
8402   jccb(Assembler::greaterEqual, L_byteByByteProlog);
8403   align(16);
8404   BIND(L_wordByWord);
8405     crc32(in_out, Address(in1, 0), 8);
8406     addq(in1, 8);
8407     cmpq(in1, tmp1);
8408     jcc(Assembler::less, L_wordByWord);
8409 
8410   BIND(L_byteByByteProlog);
8411   andl(in2, 0x00000007);
8412   movl(tmp2, 1);
8413 
8414   cmpl(tmp2, in2);
8415   jccb(Assembler::greater, L_exit);
8416   BIND(L_byteByByte);
8417     crc32(in_out, Address(in1, 0), 1);
8418     incq(in1);
8419     incl(tmp2);
8420     cmpl(tmp2, in2);
8421     jcc(Assembler::lessEqual, L_byteByByte);
8422 
8423   BIND(L_exit);
8424 }
8425 #else
8426 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
8427                                           Register tmp1, Register  tmp2, Register tmp3,
8428                                           Register tmp4, Register  tmp5, Register tmp6,
8429                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8430                                           bool is_pclmulqdq_supported) {
8431   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
8432   Label L_wordByWord;
8433   Label L_byteByByteProlog;
8434   Label L_byteByByte;
8435   Label L_exit;
8436 
8437   if (is_pclmulqdq_supported) {
8438     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
8439     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
8440 
8441     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
8442     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
8443 
8444     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8445     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8446   } else {
8447     const_or_pre_comp_const_index[0] = 1;
8448     const_or_pre_comp_const_index[1] = 0;
8449 
8450     const_or_pre_comp_const_index[2] = 3;
8451     const_or_pre_comp_const_index[3] = 2;
8452 
8453     const_or_pre_comp_const_index[4] = 5;
8454     const_or_pre_comp_const_index[5] = 4;
8455   }
8456   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8457                     in2, in1, in_out,
8458                     tmp1, tmp2, tmp3,
8459                     w_xtmp1, w_xtmp2, w_xtmp3,
8460                     tmp4, tmp5,
8461                     tmp6);
8462   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8463                     in2, in1, in_out,
8464                     tmp1, tmp2, tmp3,
8465                     w_xtmp1, w_xtmp2, w_xtmp3,
8466                     tmp4, tmp5,
8467                     tmp6);
8468   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8469                     in2, in1, in_out,
8470                     tmp1, tmp2, tmp3,
8471                     w_xtmp1, w_xtmp2, w_xtmp3,
8472                     tmp4, tmp5,
8473                     tmp6);
8474   movl(tmp1, in2);
8475   andl(tmp1, 0x00000007);
8476   negl(tmp1);
8477   addl(tmp1, in2);
8478   addl(tmp1, in1);
8479 
8480   BIND(L_wordByWord);
8481   cmpl(in1, tmp1);
8482   jcc(Assembler::greaterEqual, L_byteByByteProlog);
8483     crc32(in_out, Address(in1,0), 4);
8484     addl(in1, 4);
8485     jmp(L_wordByWord);
8486 
8487   BIND(L_byteByByteProlog);
8488   andl(in2, 0x00000007);
8489   movl(tmp2, 1);
8490 
8491   BIND(L_byteByByte);
8492   cmpl(tmp2, in2);
8493   jccb(Assembler::greater, L_exit);
8494     movb(tmp1, Address(in1, 0));
8495     crc32(in_out, tmp1, 1);
8496     incl(in1);
8497     incl(tmp2);
8498     jmp(L_byteByByte);
8499 
8500   BIND(L_exit);
8501 }
8502 #endif // LP64
8503 #undef BIND
8504 #undef BLOCK_COMMENT
8505 
8506 // Compress char[] array to byte[].
8507 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
8508 //   @IntrinsicCandidate
8509 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8510 //     for (int i = 0; i < len; i++) {
8511 //       int c = src[srcOff++];
8512 //       if (c >>> 8 != 0) {
8513 //         return 0;
8514 //       }
8515 //       dst[dstOff++] = (byte)c;
8516 //     }
8517 //     return len;
8518 //   }
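//
// The vector paths below perform the same high-byte check a whole chunk at a time:
// the chars are compared against 0x00FF (AVX-512: evpcmpuw into a mask register;
// SSE4.2: ptest against an 0xff00ff00 mask), the routine bails out to return_zero if
// any char has a nonzero high byte, and otherwise the chars are narrowed to bytes
// with evpmovwb / packuswb.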
8519 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
8520   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8521   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8522   Register tmp5, Register result, KRegister mask1, KRegister mask2) {
8523   Label copy_chars_loop, return_length, return_zero, done;
8524 
8525   // rsi: src
8526   // rdi: dst
8527   // rdx: len
8528   // rcx: tmp5
8529   // rax: result
8530 
8531   // rsi holds start addr of source char[] to be compressed
8532   // rdi holds start addr of destination byte[]
8533   // rdx holds length
8534 
8535   assert(len != result, "");
8536 
8537   // save length for return
8538   push(len);
8539 
8540   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
8541     VM_Version::supports_avx512vlbw() &&
8542     VM_Version::supports_bmi2()) {
8543 
8544     Label copy_32_loop, copy_loop_tail, below_threshold;
8545 
8546     // alignment
8547     Label post_alignment;
8548 
8549     // if the length of the string is less than 32, handle it the old-fashioned way
8550     testl(len, -32);
8551     jcc(Assembler::zero, below_threshold);
8552 
8553     // First check whether a character is compressible ( <= 0xFF).
8554     // Create mask to test for Unicode chars inside zmm vector
8555     movl(result, 0x00FF);
8556     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
8557 
8558     testl(len, -64);
8559     jcc(Assembler::zero, post_alignment);
8560 
8561     movl(tmp5, dst);
8562     andl(tmp5, (32 - 1));
8563     negl(tmp5);
8564     andl(tmp5, (32 - 1));
8565 
8566     // bail out when there is nothing to be done
8567     testl(tmp5, 0xFFFFFFFF);
8568     jcc(Assembler::zero, post_alignment);
8569 
8570     // ~(~0 << len), where len is the # of remaining elements to process
8571     movl(result, 0xFFFFFFFF);
8572     shlxl(result, result, tmp5);
8573     notl(result);
8574     kmovdl(mask2, result);
8575 
8576     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8577     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8578     ktestd(mask1, mask2);
8579     jcc(Assembler::carryClear, return_zero);
8580 
8581     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8582 
8583     addptr(src, tmp5);
8584     addptr(src, tmp5);
8585     addptr(dst, tmp5);
8586     subl(len, tmp5);
8587 
8588     bind(post_alignment);
8589     // end of alignment
8590 
8591     movl(tmp5, len);
8592     andl(tmp5, (32 - 1));    // tail count (in chars)
8593     andl(len, ~(32 - 1));    // vector count (in chars)
8594     jcc(Assembler::zero, copy_loop_tail);
8595 
8596     lea(src, Address(src, len, Address::times_2));
8597     lea(dst, Address(dst, len, Address::times_1));
8598     negptr(len);
8599 
8600     bind(copy_32_loop);
8601     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
8602     evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
8603     kortestdl(mask1, mask1);
8604     jcc(Assembler::carryClear, return_zero);
8605 
8606     // All elements in the currently processed chunk are valid candidates for
8607     // compression. Write the truncated byte elements to memory.
8608     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
8609     addptr(len, 32);
8610     jcc(Assembler::notZero, copy_32_loop);
8611 
8612     bind(copy_loop_tail);
8613     // bail out when there is nothing to be done
8614     testl(tmp5, 0xFFFFFFFF);
8615     jcc(Assembler::zero, return_length);
8616 
8617     movl(len, tmp5);
8618 
8619     // ~(~0 << len), where len is the # of remaining elements to process
8620     movl(result, 0xFFFFFFFF);
8621     shlxl(result, result, len);
8622     notl(result);
8623 
8624     kmovdl(mask2, result);
8625 
8626     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8627     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8628     ktestd(mask1, mask2);
8629     jcc(Assembler::carryClear, return_zero);
8630 
8631     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8632     jmp(return_length);
8633 
8634     bind(below_threshold);
8635   }
8636 
8637   if (UseSSE42Intrinsics) {
8638     Label copy_32_loop, copy_16, copy_tail;
8639 
8640     movl(result, len);
8641 
8642     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
8643 
8644     // vectored compression
8645     andl(len, 0xfffffff0);    // vector count (in chars)
8646     andl(result, 0x0000000f);    // tail count (in chars)
8647     testl(len, len);
8648     jcc(Assembler::zero, copy_16);
8649 
8650     // compress 16 chars per iter
8651     movdl(tmp1Reg, tmp5);
8652     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8653     pxor(tmp4Reg, tmp4Reg);
8654 
8655     lea(src, Address(src, len, Address::times_2));
8656     lea(dst, Address(dst, len, Address::times_1));
8657     negptr(len);
8658 
8659     bind(copy_32_loop);
8660     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
8661     por(tmp4Reg, tmp2Reg);
8662     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
8663     por(tmp4Reg, tmp3Reg);
8664     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
8665     jcc(Assembler::notZero, return_zero);
8666     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8667     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
8668     addptr(len, 16);
8669     jcc(Assembler::notZero, copy_32_loop);
8670 
8671     // compress next vector of 8 chars (if any)
8672     bind(copy_16);
8673     movl(len, result);
8674     andl(len, 0xfffffff8);    // vector count (in chars)
8675     andl(result, 0x00000007);    // tail count (in chars)
8676     testl(len, len);
8677     jccb(Assembler::zero, copy_tail);
8678 
8679     movdl(tmp1Reg, tmp5);
8680     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8681     pxor(tmp3Reg, tmp3Reg);
8682 
8683     movdqu(tmp2Reg, Address(src, 0));
8684     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
8685     jccb(Assembler::notZero, return_zero);
8686     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8687     movq(Address(dst, 0), tmp2Reg);
8688     addptr(src, 16);
8689     addptr(dst, 8);
8690 
8691     bind(copy_tail);
8692     movl(len, result);
8693   }
8694   // compress 1 char per iter
8695   testl(len, len);
8696   jccb(Assembler::zero, return_length);
8697   lea(src, Address(src, len, Address::times_2));
8698   lea(dst, Address(dst, len, Address::times_1));
8699   negptr(len);
8700 
8701   bind(copy_chars_loop);
8702   load_unsigned_short(result, Address(src, len, Address::times_2));
8703   testl(result, 0xff00);      // check if Unicode char
8704   jccb(Assembler::notZero, return_zero);
8705   movb(Address(dst, len, Address::times_1), result);  // ASCII char; compress to 1 byte
8706   increment(len);
8707   jcc(Assembler::notZero, copy_chars_loop);
8708 
8709   // if compression succeeded, return length
8710   bind(return_length);
8711   pop(result);
8712   jmpb(done);
8713 
8714   // if compression failed, return 0
8715   bind(return_zero);
8716   xorl(result, result);
8717   addptr(rsp, wordSize);
8718 
8719   bind(done);
8720 }
8721 
8722 // Inflate byte[] array to char[].
8723 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
8724 //   @IntrinsicCandidate
8725 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
8726 //     for (int i = 0; i < len; i++) {
8727 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
8728 //     }
8729 //   }
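//
// The vector paths below zero-extend bytes to chars with (ev/v)pmovzxbw: 32 chars per
// iteration on the AVX-512 path, 16 on the AVX2 path and 8 on the SSE path, with a
// masked or scalar tail for the remainder.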
8730 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
8731   XMMRegister tmp1, Register tmp2, KRegister mask) {
8732   Label copy_chars_loop, done, below_threshold, avx3_threshold;
8733   // rsi: src
8734   // rdi: dst
8735   // rdx: len
8736   // rcx: tmp2
8737 
8738   // rsi holds start addr of source byte[] to be inflated
8739   // rdi holds start addr of destination char[]
8740   // rdx holds length
8741   assert_different_registers(src, dst, len, tmp2);
8742   movl(tmp2, len);
8743   if ((UseAVX > 2) && // AVX512
8744     VM_Version::supports_avx512vlbw() &&
8745     VM_Version::supports_bmi2()) {
8746 
8747     Label copy_32_loop, copy_tail;
8748     Register tmp3_aliased = len;
8749 
8750     // if the length of the string is less than 16, handle it the old-fashioned way
8751     testl(len, -16);
8752     jcc(Assembler::zero, below_threshold);
8753 
8754     testl(len, -1 * AVX3Threshold);
8755     jcc(Assembler::zero, avx3_threshold);
8756 
8757     // In order to use only one arithmetic operation in the main loop, we use
8758     // this pre-calculation
8759     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
8760     andl(len, -32);     // vector count
8761     jccb(Assembler::zero, copy_tail);
8762 
8763     lea(src, Address(src, len, Address::times_1));
8764     lea(dst, Address(dst, len, Address::times_2));
8765     negptr(len);
8766 
8767 
8768     // inflate 32 chars per iter
8769     bind(copy_32_loop);
8770     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
8771     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
8772     addptr(len, 32);
8773     jcc(Assembler::notZero, copy_32_loop);
8774 
8775     bind(copy_tail);
8776     // bail out when there is nothing to be done
8777     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
8778     jcc(Assembler::zero, done);
8779 
8780     // ~(~0 << length), where length is the # of remaining elements to process
8781     movl(tmp3_aliased, -1);
8782     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
8783     notl(tmp3_aliased);
8784     kmovdl(mask, tmp3_aliased);
8785     evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
8786     evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
8787 
8788     jmp(done);
8789     bind(avx3_threshold);
8790   }
8791   if (UseSSE42Intrinsics) {
8792     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
8793 
8794     if (UseAVX > 1) {
8795       andl(tmp2, (16 - 1));
8796       andl(len, -16);
8797       jccb(Assembler::zero, copy_new_tail);
8798     } else {
8799       andl(tmp2, 0x00000007);   // tail count (in chars)
8800       andl(len, 0xfffffff8);    // vector count (in chars)
8801       jccb(Assembler::zero, copy_tail);
8802     }
8803 
8804     // vectored inflation
8805     lea(src, Address(src, len, Address::times_1));
8806     lea(dst, Address(dst, len, Address::times_2));
8807     negptr(len);
8808 
8809     if (UseAVX > 1) {
8810       bind(copy_16_loop);
8811       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
8812       vmovdqu(Address(dst, len, Address::times_2), tmp1);
8813       addptr(len, 16);
8814       jcc(Assembler::notZero, copy_16_loop);
8815 
8816       bind(below_threshold);
8817       bind(copy_new_tail);
8818       movl(len, tmp2);
8819       andl(tmp2, 0x00000007);
8820       andl(len, 0xFFFFFFF8);
8821       jccb(Assembler::zero, copy_tail);
8822 
8823       pmovzxbw(tmp1, Address(src, 0));
8824       movdqu(Address(dst, 0), tmp1);
8825       addptr(src, 8);
8826       addptr(dst, 2 * 8);
8827 
8828       jmp(copy_tail, true);
8829     }
8830 
8831     // inflate 8 chars per iter
8832     bind(copy_8_loop);
8833     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
8834     movdqu(Address(dst, len, Address::times_2), tmp1);
8835     addptr(len, 8);
8836     jcc(Assembler::notZero, copy_8_loop);
8837 
8838     bind(copy_tail);
8839     movl(len, tmp2);
8840 
8841     cmpl(len, 4);
8842     jccb(Assembler::less, copy_bytes);
8843 
8844     movdl(tmp1, Address(src, 0));  // load 4 byte chars
8845     pmovzxbw(tmp1, tmp1);
8846     movq(Address(dst, 0), tmp1);
8847     subptr(len, 4);
8848     addptr(src, 4);
8849     addptr(dst, 8);
8850 
8851     bind(copy_bytes);
8852   } else {
8853     bind(below_threshold);
8854   }
8855 
8856   testl(len, len);
8857   jccb(Assembler::zero, done);
8858   lea(src, Address(src, len, Address::times_1));
8859   lea(dst, Address(dst, len, Address::times_2));
8860   negptr(len);
8861 
8862   // inflate 1 char per iter
8863   bind(copy_chars_loop);
8864   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
8865   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
8866   increment(len);
8867   jcc(Assembler::notZero, copy_chars_loop);
8868 
8869   bind(done);
8870 }
8871 
8872 
8873 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
8874   switch(type) {
8875     case T_BYTE:
8876     case T_BOOLEAN:
8877       evmovdqub(dst, kmask, src, merge, vector_len);
8878       break;
8879     case T_CHAR:
8880     case T_SHORT:
8881       evmovdquw(dst, kmask, src, merge, vector_len);
8882       break;
8883     case T_INT:
8884     case T_FLOAT:
8885       evmovdqul(dst, kmask, src, merge, vector_len);
8886       break;
8887     case T_LONG:
8888     case T_DOUBLE:
8889       evmovdquq(dst, kmask, src, merge, vector_len);
8890       break;
8891     default:
8892       fatal("Unexpected type argument %s", type2name(type));
8893       break;
8894   }
8895 }
8896 
8897 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
8898   switch(type) {
8899     case T_BYTE:
8900     case T_BOOLEAN:
8901       evmovdqub(dst, kmask, src, merge, vector_len);
8902       break;
8903     case T_CHAR:
8904     case T_SHORT:
8905       evmovdquw(dst, kmask, src, merge, vector_len);
8906       break;
8907     case T_INT:
8908     case T_FLOAT:
8909       evmovdqul(dst, kmask, src, merge, vector_len);
8910       break;
8911     case T_LONG:
8912     case T_DOUBLE:
8913       evmovdquq(dst, kmask, src, merge, vector_len);
8914       break;
8915     default:
8916       fatal("Unexpected type argument %s", type2name(type));
8917       break;
8918   }
8919 }
8920 
8921 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
8922   switch(masklen) {
8923     case 2:
8924        knotbl(dst, src);
8925        movl(rtmp, 3);
8926        kmovbl(ktmp, rtmp);
8927        kandbl(dst, ktmp, dst);
8928        break;
8929     case 4:
8930        knotbl(dst, src);
8931        movl(rtmp, 15);
8932        kmovbl(ktmp, rtmp);
8933        kandbl(dst, ktmp, dst);
8934        break;
8935     case 8:
8936        knotbl(dst, src);
8937        break;
8938     case 16:
8939        knotwl(dst, src);
8940        break;
8941     case 32:
8942        knotdl(dst, src);
8943        break;
8944     case 64:
8945        knotql(dst, src);
8946        break;
8947     default:
8948       fatal("Unexpected vector length %d", masklen);
8949       break;
8950   }
8951 }
8952 
8953 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8954   switch(type) {
8955     case T_BOOLEAN:
8956     case T_BYTE:
8957        kandbl(dst, src1, src2);
8958        break;
8959     case T_CHAR:
8960     case T_SHORT:
8961        kandwl(dst, src1, src2);
8962        break;
8963     case T_INT:
8964     case T_FLOAT:
8965        kanddl(dst, src1, src2);
8966        break;
8967     case T_LONG:
8968     case T_DOUBLE:
8969        kandql(dst, src1, src2);
8970        break;
8971     default:
8972       fatal("Unexpected type argument %s", type2name(type));
8973       break;
8974   }
8975 }
8976 
8977 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8978   switch(type) {
8979     case T_BOOLEAN:
8980     case T_BYTE:
8981        korbl(dst, src1, src2);
8982        break;
8983     case T_CHAR:
8984     case T_SHORT:
8985        korwl(dst, src1, src2);
8986        break;
8987     case T_INT:
8988     case T_FLOAT:
8989        kordl(dst, src1, src2);
8990        break;
8991     case T_LONG:
8992     case T_DOUBLE:
8993        korql(dst, src1, src2);
8994        break;
8995     default:
8996       fatal("Unexpected type argument %s", type2name(type));
8997       break;
8998   }
8999 }
9000 
9001 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9002   switch(type) {
9003     case T_BOOLEAN:
9004     case T_BYTE:
9005        kxorbl(dst, src1, src2);
9006        break;
9007     case T_CHAR:
9008     case T_SHORT:
9009        kxorwl(dst, src1, src2);
9010        break;
9011     case T_INT:
9012     case T_FLOAT:
9013        kxordl(dst, src1, src2);
9014        break;
9015     case T_LONG:
9016     case T_DOUBLE:
9017        kxorql(dst, src1, src2);
9018        break;
9019     default:
9020       fatal("Unexpected type argument %s", type2name(type));
9021       break;
9022   }
9023 }
9024 
9025 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9026   switch(type) {
9027     case T_BOOLEAN:
9028     case T_BYTE:
9029       evpermb(dst, mask, nds, src, merge, vector_len); break;
9030     case T_CHAR:
9031     case T_SHORT:
9032       evpermw(dst, mask, nds, src, merge, vector_len); break;
9033     case T_INT:
9034     case T_FLOAT:
9035       evpermd(dst, mask, nds, src, merge, vector_len); break;
9036     case T_LONG:
9037     case T_DOUBLE:
9038       evpermq(dst, mask, nds, src, merge, vector_len); break;
9039     default:
9040       fatal("Unexpected type argument %s", type2name(type)); break;
9041   }
9042 }
9043 
9044 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9045   switch(type) {
9046     case T_BOOLEAN:
9047     case T_BYTE:
9048       evpermb(dst, mask, nds, src, merge, vector_len); break;
9049     case T_CHAR:
9050     case T_SHORT:
9051       evpermw(dst, mask, nds, src, merge, vector_len); break;
9052     case T_INT:
9053     case T_FLOAT:
9054       evpermd(dst, mask, nds, src, merge, vector_len); break;
9055     case T_LONG:
9056     case T_DOUBLE:
9057       evpermq(dst, mask, nds, src, merge, vector_len); break;
9058     default:
9059       fatal("Unexpected type argument %s", type2name(type)); break;
9060   }
9061 }
9062 
9063 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9064   switch(type) {
9065     case T_BYTE:
9066       evpminsb(dst, mask, nds, src, merge, vector_len); break;
9067     case T_SHORT:
9068       evpminsw(dst, mask, nds, src, merge, vector_len); break;
9069     case T_INT:
9070       evpminsd(dst, mask, nds, src, merge, vector_len); break;
9071     case T_LONG:
9072       evpminsq(dst, mask, nds, src, merge, vector_len); break;
9073     default:
9074       fatal("Unexpected type argument %s", type2name(type)); break;
9075   }
9076 }
9077 
9078 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9079   switch(type) {
9080     case T_BYTE:
9081       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9082     case T_SHORT:
9083       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9084     case T_INT:
9085       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9086     case T_LONG:
9087       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9088     default:
9089       fatal("Unexpected type argument %s", type2name(type)); break;
9090   }
9091 }
9092 
9093 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9094   switch(type) {
9095     case T_BYTE:
9096       evpminsb(dst, mask, nds, src, merge, vector_len); break;
9097     case T_SHORT:
9098       evpminsw(dst, mask, nds, src, merge, vector_len); break;
9099     case T_INT:
9100       evpminsd(dst, mask, nds, src, merge, vector_len); break;
9101     case T_LONG:
9102       evpminsq(dst, mask, nds, src, merge, vector_len); break;
9103     default:
9104       fatal("Unexpected type argument %s", type2name(type)); break;
9105   }
9106 }
9107 
9108 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9109   switch(type) {
9110     case T_BYTE:
9111       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9112     case T_SHORT:
9113       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9114     case T_INT:
9115       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9116     case T_LONG:
9117       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9118     default:
9119       fatal("Unexpected type argument %s", type2name(type)); break;
9120   }
9121 }
9122 
9123 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9124   switch(type) {
9125     case T_INT:
9126       evpxord(dst, mask, nds, src, merge, vector_len); break;
9127     case T_LONG:
9128       evpxorq(dst, mask, nds, src, merge, vector_len); break;
9129     default:
9130       fatal("Unexpected type argument %s", type2name(type)); break;
9131   }
9132 }
9133 
9134 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9135   switch(type) {
9136     case T_INT:
9137       evpxord(dst, mask, nds, src, merge, vector_len); break;
9138     case T_LONG:
9139       evpxorq(dst, mask, nds, src, merge, vector_len); break;
9140     default:
9141       fatal("Unexpected type argument %s", type2name(type)); break;
9142   }
9143 }
9144 
9145 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9146   switch(type) {
9147     case T_INT:
9148       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9149     case T_LONG:
9150       evporq(dst, mask, nds, src, merge, vector_len); break;
9151     default:
9152       fatal("Unexpected type argument %s", type2name(type)); break;
9153   }
9154 }
9155 
9156 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9157   switch(type) {
9158     case T_INT:
9159       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9160     case T_LONG:
9161       evporq(dst, mask, nds, src, merge, vector_len); break;
9162     default:
9163       fatal("Unexpected type argument %s", type2name(type)); break;
9164   }
9165 }
9166 
9167 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9168   switch(type) {
9169     case T_INT:
9170       evpandd(dst, mask, nds, src, merge, vector_len); break;
9171     case T_LONG:
9172       evpandq(dst, mask, nds, src, merge, vector_len); break;
9173     default:
9174       fatal("Unexpected type argument %s", type2name(type)); break;
9175   }
9176 }
9177 
9178 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9179   switch(type) {
9180     case T_INT:
9181       evpandd(dst, mask, nds, src, merge, vector_len); break;
9182     case T_LONG:
9183       evpandq(dst, mask, nds, src, merge, vector_len); break;
9184     default:
9185       fatal("Unexpected type argument %s", type2name(type)); break;
9186   }
9187 }
9188 
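// ORs the two mask registers and sets the flags (ZF when the result is all zeros),
// dispatching on the mask length in bits.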
9189 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
9190   switch(masklen) {
9191     case 8:
9192        kortestbl(src1, src2);
9193        break;
9194     case 16:
9195        kortestwl(src1, src2);
9196        break;
9197     case 32:
9198        kortestdl(src1, src2);
9199        break;
9200     case 64:
9201        kortestql(src1, src2);
9202        break;
9203     default:
9204       fatal("Unexpected mask length %d", masklen);
9205       break;
9206   }
9207 }
9208 
9209 
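// Tests the two mask registers (ZF is set when their AND is all zeros), dispatching on the
// mask length in bits.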
9210 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
9211   switch(masklen)  {
9212     case 8:
9213        ktestbl(src1, src2);
9214        break;
9215     case 16:
9216        ktestwl(src1, src2);
9217        break;
9218     case 32:
9219        ktestdl(src1, src2);
9220        break;
9221     case 64:
9222        ktestql(src1, src2);
9223        break;
9224     default:
9225       fatal("Unexpected mask length %d", masklen);
9226       break;
9227   }
9228 }
9229 
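// Vector rotate helpers, dispatched on lane type (int or long): the first pair rotates by an
// immediate count, the second pair by per-lane counts taken from src2.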
9230 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9231   switch(type) {
9232     case T_INT:
9233       evprold(dst, mask, src, shift, merge, vlen_enc); break;
9234     case T_LONG:
9235       evprolq(dst, mask, src, shift, merge, vlen_enc); break;
9236     default:
9237       fatal("Unexpected type argument %s", type2name(type)); break;
9239   }
9240 }
9241 
9242 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9243   switch(type) {
9244     case T_INT:
9245       evprord(dst, mask, src, shift, merge, vlen_enc); break;
9246     case T_LONG:
9247       evprorq(dst, mask, src, shift, merge, vlen_enc); break;
9248     default:
9249       fatal("Unexpected type argument %s", type2name(type)); break;
9250   }
9251 }
9252 
9253 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9254   switch(type) {
9255     case T_INT:
9256       evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
9257     case T_LONG:
9258       evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
9259     default:
9260       fatal("Unexpected type argument %s", type2name(type)); break;
9261   }
9262 }
9263 
9264 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9265   switch(type) {
9266     case T_INT:
9267       evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
9268     case T_LONG:
9269       evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
9270     default:
9271       fatal("Unexpected type argument %s", type2name(type)); break;
9272   }
9273 }
9274 
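// AddressLiteral variants: the literal is used directly when it is reachable, otherwise its
// address is materialized into rscratch with lea and accessed through a plain Address.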
9275 void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9276   assert(rscratch != noreg || always_reachable(src), "missing");
9277 
9278   if (reachable(src)) {
9279     evpandq(dst, nds, as_Address(src), vector_len);
9280   } else {
9281     lea(rscratch, src);
9282     evpandq(dst, nds, Address(rscratch, 0), vector_len);
9283   }
9284 }
9285 
9286 void MacroAssembler::evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
9287   assert(rscratch != noreg || always_reachable(src), "missing");
9288 
9289   if (reachable(src)) {
9290     Assembler::evpaddq(dst, mask, nds, as_Address(src), merge, vector_len);
9291   } else {
9292     lea(rscratch, src);
9293     Assembler::evpaddq(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
9294   }
9295 }
9296 
9297 void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9298   assert(rscratch != noreg || always_reachable(src), "missing");
9299 
9300   if (reachable(src)) {
9301     evporq(dst, nds, as_Address(src), vector_len);
9302   } else {
9303     lea(rscratch, src);
9304     evporq(dst, nds, Address(rscratch, 0), vector_len);
9305   }
9306 }
9307 
9308 void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) {
9309   assert(rscratch != noreg || always_reachable(src3), "missing");
9310 
9311   if (reachable(src3)) {
9312     vpternlogq(dst, imm8, src2, as_Address(src3), vector_len);
9313   } else {
9314     lea(rscratch, src3);
9315     vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len);
9316   }
9317 }
9318 
9319 #if COMPILER2_OR_JVMCI
9320 
9321 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
9322                                  Register length, Register temp, int vec_enc) {
9323   // Computing mask for predicated vector store.
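  // temp is set to -1 and bzhi clears all but its low 'length' bits, so the k-register mask
  // selects exactly 'length' elements for the masked store below.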
9324   movptr(temp, -1);
9325   bzhiq(temp, temp, length);
9326   kmov(mask, temp);
9327   evmovdqu(bt, mask, dst, xmm, true, vec_enc);
9328 }
9329 
// Set memory operation for lengths of at most 64 bytes.
9331 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
9332                                        XMMRegister xmm, KRegister mask, Register length,
9333                                        Register temp, bool use64byteVector) {
9334   assert(MaxVectorSize >= 32, "vector length should be >= 32");
9335   const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9336   if (!use64byteVector) {
9337     fill32(dst, disp, xmm);
9338     subptr(length, 32 >> shift);
9339     fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
9340   } else {
9341     assert(MaxVectorSize == 64, "vector length != 64");
9342     fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
9343   }
9344 }
9345 
9346 
9347 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
9348                                        XMMRegister xmm, KRegister mask, Register length,
9349                                        Register temp) {
9350   assert(MaxVectorSize >= 32, "vector length should be >= 32");
9351   const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9352   fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
9353 }
9354 
9355 
9356 void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
9357   assert(MaxVectorSize >= 32, "vector length should be >= 32");
9358   vmovdqu(dst, xmm);
9359 }
9360 
9361 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
9362   fill32(Address(dst, disp), xmm);
9363 }
9364 
9365 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
9366   assert(MaxVectorSize >= 32, "vector length should be >= 32");
9367   if (!use64byteVector) {
9368     fill32(dst, xmm);
9369     fill32(dst.plus_disp(32), xmm);
9370   } else {
9371     evmovdquq(dst, xmm, Assembler::AVX_512bit);
9372   }
9373 }
9374 
9375 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
9376   fill64(Address(dst, disp), xmm, use64byteVector);
9377 }
9378 
9379 #ifdef _LP64
9380 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
9381                                         Register count, Register rtmp, XMMRegister xtmp) {
9382   Label L_exit;
9383   Label L_fill_start;
9384   Label L_fill_64_bytes;
9385   Label L_fill_96_bytes;
9386   Label L_fill_128_bytes;
9387   Label L_fill_128_bytes_loop;
9388   Label L_fill_128_loop_header;
9389   Label L_fill_128_bytes_loop_header;
9390   Label L_fill_128_bytes_loop_pre_header;
9391   Label L_fill_zmm_sequence;
9392 
9393   int shift = -1;
9394   int avx3threshold = VM_Version::avx3_threshold();
9395   switch(type) {
9396     case T_BYTE:  shift = 0;
9397       break;
9398     case T_SHORT: shift = 1;
9399       break;
9400     case T_INT:   shift = 2;
9401       break;
9402     /* Uncomment when LONG fill stubs are supported.
9403     case T_LONG:  shift = 3;
9404       break;
9405     */
9406     default:
9407       fatal("Unhandled type: %s\n", type2name(type));
9408   }
9409 
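  // With a non-zero AVX3 threshold (or a 32-byte MaxVectorSize) small fills use the 32-byte
  // YMM sequence below; on AVX-512, counts above the threshold jump to the 64-byte ZMM sequence.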
9410   if ((avx3threshold != 0)  || (MaxVectorSize == 32)) {
9411 
9412     if (MaxVectorSize == 64) {
9413       cmpq(count, avx3threshold >> shift);
9414       jcc(Assembler::greater, L_fill_zmm_sequence);
9415     }
9416 
9417     evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
9418 
9419     bind(L_fill_start);
9420 
9421     cmpq(count, 32 >> shift);
9422     jccb(Assembler::greater, L_fill_64_bytes);
9423     fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
9424     jmp(L_exit);
9425 
9426     bind(L_fill_64_bytes);
9427     cmpq(count, 64 >> shift);
9428     jccb(Assembler::greater, L_fill_96_bytes);
9429     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
9430     jmp(L_exit);
9431 
9432     bind(L_fill_96_bytes);
9433     cmpq(count, 96 >> shift);
9434     jccb(Assembler::greater, L_fill_128_bytes);
9435     fill64(to, 0, xtmp);
9436     subq(count, 64 >> shift);
9437     fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
9438     jmp(L_exit);
9439 
9440     bind(L_fill_128_bytes);
9441     cmpq(count, 128 >> shift);
9442     jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
9443     fill64(to, 0, xtmp);
9444     fill32(to, 64, xtmp);
9445     subq(count, 96 >> shift);
9446     fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
9447     jmp(L_exit);
9448 
9449     bind(L_fill_128_bytes_loop_pre_header);
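    // Align the destination to a 32-byte boundary: store the unaligned prefix with a
    // byte-masked 32-byte store, then advance 'to' and reduce 'count' by the elements written.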
9450     {
9451       mov(rtmp, to);
9452       andq(rtmp, 31);
9453       jccb(Assembler::zero, L_fill_128_bytes_loop_header);
9454       negq(rtmp);
9455       addq(rtmp, 32);
9456       mov64(r8, -1L);
9457       bzhiq(r8, r8, rtmp);
9458       kmovql(k2, r8);
9459       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit);
9460       addq(to, rtmp);
9461       shrq(rtmp, shift);
9462       subq(count, rtmp);
9463     }
9464 
9465     cmpq(count, 128 >> shift);
9466     jcc(Assembler::less, L_fill_start);
9467 
9468     bind(L_fill_128_bytes_loop_header);
9469     subq(count, 128 >> shift);
9470 
9471     align32();
9472     bind(L_fill_128_bytes_loop);
9473       fill64(to, 0, xtmp);
9474       fill64(to, 64, xtmp);
9475       addq(to, 128);
9476       subq(count, 128 >> shift);
9477       jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
9478 
9479     addq(count, 128 >> shift);
9480     jcc(Assembler::zero, L_exit);
9481     jmp(L_fill_start);
9482   }
9483 
9484   if (MaxVectorSize == 64) {
9485     // Sequence using 64 byte ZMM register.
9486     Label L_fill_128_bytes_zmm;
9487     Label L_fill_192_bytes_zmm;
9488     Label L_fill_192_bytes_loop_zmm;
9489     Label L_fill_192_bytes_loop_header_zmm;
9490     Label L_fill_192_bytes_loop_pre_header_zmm;
9491     Label L_fill_start_zmm_sequence;
9492 
9493     bind(L_fill_zmm_sequence);
9494     evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
9495 
9496     bind(L_fill_start_zmm_sequence);
9497     cmpq(count, 64 >> shift);
9498     jccb(Assembler::greater, L_fill_128_bytes_zmm);
9499     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
9500     jmp(L_exit);
9501 
9502     bind(L_fill_128_bytes_zmm);
9503     cmpq(count, 128 >> shift);
9504     jccb(Assembler::greater, L_fill_192_bytes_zmm);
9505     fill64(to, 0, xtmp, true);
9506     subq(count, 64 >> shift);
9507     fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
9508     jmp(L_exit);
9509 
9510     bind(L_fill_192_bytes_zmm);
9511     cmpq(count, 192 >> shift);
9512     jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
9513     fill64(to, 0, xtmp, true);
9514     fill64(to, 64, xtmp, true);
9515     subq(count, 128 >> shift);
9516     fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
9517     jmp(L_exit);
9518 
9519     bind(L_fill_192_bytes_loop_pre_header_zmm);
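    // Align the destination to a 64-byte boundary: store the unaligned prefix with a
    // byte-masked 64-byte store, then advance 'to' and reduce 'count' by the elements written.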
9520     {
9521       movq(rtmp, to);
9522       andq(rtmp, 63);
9523       jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
9524       negq(rtmp);
9525       addq(rtmp, 64);
9526       mov64(r8, -1L);
9527       bzhiq(r8, r8, rtmp);
9528       kmovql(k2, r8);
9529       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit);
9530       addq(to, rtmp);
9531       shrq(rtmp, shift);
9532       subq(count, rtmp);
9533     }
9534 
9535     cmpq(count, 192 >> shift);
9536     jcc(Assembler::less, L_fill_start_zmm_sequence);
9537 
9538     bind(L_fill_192_bytes_loop_header_zmm);
9539     subq(count, 192 >> shift);
9540 
9541     align32();
9542     bind(L_fill_192_bytes_loop_zmm);
9543       fill64(to, 0, xtmp, true);
9544       fill64(to, 64, xtmp, true);
9545       fill64(to, 128, xtmp, true);
9546       addq(to, 192);
9547       subq(count, 192 >> shift);
9548       jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
9549 
9550     addq(count, 192 >> shift);
9551     jcc(Assembler::zero, L_exit);
9552     jmp(L_fill_start_zmm_sequence);
9553   }
9554   bind(L_exit);
9555 }
9556 #endif
9557 #endif //COMPILER2_OR_JVMCI
9558 
9559 
9560 #ifdef _LP64
9561 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
9562   Label done;
9563   cvttss2sil(dst, src);
9564   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9565   cmpl(dst, 0x80000000); // float_sign_flip
9566   jccb(Assembler::notEqual, done);
9567   subptr(rsp, 8);
9568   movflt(Address(rsp, 0), src);
9569   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
9570   pop(dst);
9571   bind(done);
9572 }
9573 
9574 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
9575   Label done;
9576   cvttsd2sil(dst, src);
9577   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9578   cmpl(dst, 0x80000000); // float_sign_flip
9579   jccb(Assembler::notEqual, done);
9580   subptr(rsp, 8);
9581   movdbl(Address(rsp, 0), src);
9582   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
9583   pop(dst);
9584   bind(done);
9585 }
9586 
9587 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
9588   Label done;
9589   cvttss2siq(dst, src);
9590   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9591   jccb(Assembler::notEqual, done);
9592   subptr(rsp, 8);
9593   movflt(Address(rsp, 0), src);
9594   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
9595   pop(dst);
9596   bind(done);
9597 }
9598 
9599 void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  // The following code is a line-by-line assembly translation of the rounding algorithm.
  // Please refer to the java.lang.Math.round(float) algorithm for details.
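  // In outline, the translation mirrors the Java algorithm:
  //   shift = (SIGNIFICAND_WIDTH - 2 + EXP_BIAS) - biasedExponent
  //   if shift is in [0, 31]:
  //     r = significand with the implicit bit set, negated when the input is negative
  //     result = ((r >> shift) + 1) >> 1   // add 0.5 in fixed point, then truncate
  //   else:
  //     result = (int) value               // handled by convert_f2i below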
9602   const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
9603   const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
9604   const int32_t FloatConsts_EXP_BIAS = 127;
9605   const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
9606   const int32_t MINUS_32 = 0xFFFFFFE0;
9607   Label L_special_case, L_block1, L_exit;
9608   movl(rtmp, FloatConsts_EXP_BIT_MASK);
9609   movdl(dst, src);
9610   andl(dst, rtmp);
9611   sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
9612   movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
9613   subl(rtmp, dst);
9614   movl(rcx, rtmp);
9615   movl(dst, MINUS_32);
9616   testl(rtmp, dst);
9617   jccb(Assembler::notEqual, L_special_case);
9618   movdl(dst, src);
9619   andl(dst, FloatConsts_SIGNIF_BIT_MASK);
9620   orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
9621   movdl(rtmp, src);
9622   testl(rtmp, rtmp);
9623   jccb(Assembler::greaterEqual, L_block1);
9624   negl(dst);
9625   bind(L_block1);
9626   sarl(dst);
9627   addl(dst, 0x1);
9628   sarl(dst, 0x1);
9629   jmp(L_exit);
9630   bind(L_special_case);
9631   convert_f2i(dst, src);
9632   bind(L_exit);
9633 }
9634 
9635 void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  // The following code is a line-by-line assembly translation of the rounding algorithm.
  // Please refer to the java.lang.Math.round(double) algorithm for details.
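  // Same outline as round_float above, using 64-bit arithmetic and the Double constants;
  // shift values outside [0, 63] fall back to convert_d2l.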
9638   const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
9639   const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
9640   const int64_t DoubleConsts_EXP_BIAS = 1023;
9641   const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
9642   const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
9643   Label L_special_case, L_block1, L_exit;
9644   mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
9645   movq(dst, src);
9646   andq(dst, rtmp);
9647   sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
9648   mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
9649   subq(rtmp, dst);
9650   movq(rcx, rtmp);
9651   mov64(dst, MINUS_64);
9652   testq(rtmp, dst);
9653   jccb(Assembler::notEqual, L_special_case);
9654   movq(dst, src);
9655   mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
9656   andq(dst, rtmp);
9657   mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
9658   orq(dst, rtmp);
9659   movq(rtmp, src);
9660   testq(rtmp, rtmp);
9661   jccb(Assembler::greaterEqual, L_block1);
9662   negq(dst);
9663   bind(L_block1);
9664   sarq(dst);
9665   addq(dst, 0x1);
9666   sarq(dst, 0x1);
9667   jmp(L_exit);
9668   bind(L_special_case);
9669   convert_d2l(dst, src);
9670   bind(L_exit);
9671 }
9672 
9673 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
9674   Label done;
9675   cvttsd2siq(dst, src);
9676   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9677   jccb(Assembler::notEqual, done);
9678   subptr(rsp, 8);
9679   movdbl(Address(rsp, 0), src);
9680   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
9681   pop(dst);
9682   bind(done);
9683 }
9684 
9685 void MacroAssembler::cache_wb(Address line)
9686 {
  // 64-bit CPUs always support clflush.
9688   assert(VM_Version::supports_clflush(), "clflush should be available");
9689   bool optimized = VM_Version::supports_clflushopt();
9690   bool no_evict = VM_Version::supports_clwb();
9691 
  // Prefer clwb (writeback without evict); otherwise prefer clflushopt (potentially parallel
  // writeback with evict); otherwise fall back on clflush (serial writeback with evict).
9695 
9696   if (optimized) {
9697     if (no_evict) {
9698       clwb(line);
9699     } else {
9700       clflushopt(line);
9701     }
9702   } else {
9703     // no need for fence when using CLFLUSH
9704     clflush(line);
9705   }
9706 }
9707 
9708 void MacroAssembler::cache_wbsync(bool is_pre)
9709 {
9710   assert(VM_Version::supports_clflush(), "clflush should be available");
9711   bool optimized = VM_Version::supports_clflushopt();
9712   bool no_evict = VM_Version::supports_clwb();
9713 
9714   // pick the correct implementation
9715 
9716   if (!is_pre && (optimized || no_evict)) {
    // An sfence is needed after the flushes when using clflushopt or clwb;
    // otherwise no synchronization is needed.
9719 
9720     sfence();
9721   }
9722 }
9723 
9724 #endif // _LP64
9725 
9726 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9727   switch (cond) {
9728     // Note some conditions are synonyms for others
9729     case Assembler::zero:         return Assembler::notZero;
9730     case Assembler::notZero:      return Assembler::zero;
9731     case Assembler::less:         return Assembler::greaterEqual;
9732     case Assembler::lessEqual:    return Assembler::greater;
9733     case Assembler::greater:      return Assembler::lessEqual;
9734     case Assembler::greaterEqual: return Assembler::less;
9735     case Assembler::below:        return Assembler::aboveEqual;
9736     case Assembler::belowEqual:   return Assembler::above;
9737     case Assembler::above:        return Assembler::belowEqual;
9738     case Assembler::aboveEqual:   return Assembler::below;
9739     case Assembler::overflow:     return Assembler::noOverflow;
9740     case Assembler::noOverflow:   return Assembler::overflow;
9741     case Assembler::negative:     return Assembler::positive;
9742     case Assembler::positive:     return Assembler::negative;
9743     case Assembler::parity:       return Assembler::noParity;
9744     case Assembler::noParity:     return Assembler::parity;
9745   }
9746   ShouldNotReachHere(); return Assembler::overflow;
9747 }
9748 
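// SkipIfEqual compares the byte flag at flag_addr with 'value' at construction and jumps over
// the code emitted inside its scope when they are equal; the destructor binds the jump target,
// so the guarded code only runs when the flag differs from 'value'. A typical use (sketch,
// with SomeFlag as a placeholder):
//   { SkipIfEqual skip_if(masm, &SomeFlag, false, rscratch);
//     ... code emitted only when SomeFlag is true ...
//   }   // ~SkipIfEqual binds the label here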
9749 SkipIfEqual::SkipIfEqual(
9750     MacroAssembler* masm, const bool* flag_addr, bool value, Register rscratch) {
9751   _masm = masm;
9752   _masm->cmp8(ExternalAddress((address)flag_addr), value, rscratch);
9753   _masm->jcc(Assembler::equal, _label);
9754 }
9755 
9756 SkipIfEqual::~SkipIfEqual() {
9757   _masm->bind(_label);
9758 }
9759 
9760 // 32-bit Windows has its own fast-path implementation
9761 // of get_thread
9762 #if !defined(WIN32) || defined(_LP64)
9763 
9764 // This is simply a call to Thread::current()
9765 void MacroAssembler::get_thread(Register thread) {
9766   if (thread != rax) {
9767     push(rax);
9768   }
9769   LP64_ONLY(push(rdi);)
9770   LP64_ONLY(push(rsi);)
9771   push(rdx);
9772   push(rcx);
9773 #ifdef _LP64
9774   push(r8);
9775   push(r9);
9776   push(r10);
9777   push(r11);
9778 #endif
9779 
9780   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
9781 
9782 #ifdef _LP64
9783   pop(r11);
9784   pop(r10);
9785   pop(r9);
9786   pop(r8);
9787 #endif
9788   pop(rcx);
9789   pop(rdx);
9790   LP64_ONLY(pop(rsi);)
9791   LP64_ONLY(pop(rdi);)
9792   if (thread != rax) {
9793     mov(thread, rax);
9794     pop(rax);
9795   }
9796 }
9797 
9798 
9799 #endif // !WIN32 || _LP64
9800 
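// Verifies that 'sp' (offset by an optional 'bias') is aligned to 2 * wordSize; stops with
// 'msg' when it is not.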
9801 void MacroAssembler::check_stack_alignment(Register sp, const char* msg, unsigned bias, Register tmp) {
9802   Label L_stack_ok;
9803   if (bias == 0) {
9804     testptr(sp, 2 * wordSize - 1);
9805   } else {
    // lea(tmp, Address(rsp, bias));
9807     mov(tmp, sp);
9808     addptr(tmp, bias);
9809     testptr(tmp, 2 * wordSize - 1);
9810   }
9811   jcc(Assembler::equal, L_stack_ok);
9812   block_comment(msg);
9813   stop(msg);
9814   bind(L_stack_ok);
9815 }
9816 
9817 // Implements lightweight-locking.
9818 // Branches to slow upon failure to lock the object, with ZF cleared.
9819 // Falls through upon success with unspecified ZF.
9820 //
9821 // obj: the object to be locked
9822 // hdr: the (pre-loaded) header of the object, must be rax
9823 // thread: the thread which attempts to lock obj
9824 // tmp: a temporary register
9825 void MacroAssembler::lightweight_lock(Register obj, Register hdr, Register thread, Register tmp, Label& slow) {
9826   assert(hdr == rax, "header must be in rax for cmpxchg");
9827   assert_different_registers(obj, hdr, thread, tmp);
9828 
9829   // First we need to check if the lock-stack has room for pushing the object reference.
9830   // Note: we subtract 1 from the end-offset so that we can do a 'greater' comparison, instead
9831   // of 'greaterEqual' below, which readily clears the ZF. This makes C2 code a little simpler and
9832   // avoids one branch.
9833   cmpl(Address(thread, JavaThread::lock_stack_top_offset()), LockStack::end_offset() - 1);
9834   jcc(Assembler::greater, slow);
9835 
9836   // Now we attempt to take the fast-lock.
9837   // Clear lock_mask bits (locked state).
9838   andptr(hdr, ~(int32_t)markWord::lock_mask_in_place);
9839   movptr(tmp, hdr);
9840   // Set unlocked_value bit.
9841   orptr(hdr, markWord::unlocked_value);
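  // At this point hdr (rax) holds the expected, unlocked mark word and tmp holds the same
  // word with the lock bits cleared (fast-locked). The cmpxchg below installs the locked mark
  // only if the object is still unlocked; otherwise ZF is cleared and we take the slow path.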
9842   lock();
9843   cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
9844   jcc(Assembler::notEqual, slow);
9845 
9846   // If successful, push object to lock-stack.
9847   movl(tmp, Address(thread, JavaThread::lock_stack_top_offset()));
9848   movptr(Address(thread, tmp), obj);
9849   incrementl(tmp, oopSize);
9850   movl(Address(thread, JavaThread::lock_stack_top_offset()), tmp);
9851 }
9852 
9853 // Implements lightweight-unlocking.
9854 // Branches to slow upon failure, with ZF cleared.
9855 // Falls through upon success, with unspecified ZF.
9856 //
9857 // obj: the object to be unlocked
9858 // hdr: the (pre-loaded) header of the object, must be rax
9859 // tmp: a temporary register
9860 void MacroAssembler::lightweight_unlock(Register obj, Register hdr, Register tmp, Label& slow) {
9861   assert(hdr == rax, "header must be in rax for cmpxchg");
9862   assert_different_registers(obj, hdr, tmp);
9863 
9864   // Mark-word must be lock_mask now, try to swing it back to unlocked_value.
9865   movptr(tmp, hdr); // The expected old value
9866   orptr(tmp, markWord::unlocked_value);
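  // hdr (rax) holds the expected, locked mark word and tmp the same word with the unlocked
  // bit set; the cmpxchg below swings the mark back to unlocked, or clears ZF and branches
  // to the slow path if the mark has changed in the meantime.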
9867   lock();
9868   cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
9869   jcc(Assembler::notEqual, slow);
9870   // Pop the lock object from the lock-stack.
9871 #ifdef _LP64
9872   const Register thread = r15_thread;
9873 #else
9874   const Register thread = rax;
9875   get_thread(thread);
9876 #endif
9877   subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
9878 #ifdef ASSERT
9879   movl(tmp, Address(thread, JavaThread::lock_stack_top_offset()));
9880   movptr(Address(thread, tmp), 0);
9881 #endif
9882 }