1 /*
   2  * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "compiler/compiler_globals.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "crc32c.h"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/collectedHeap.inline.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/bytecodeHistogram.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "jvm.h"
  38 #include "memory/resourceArea.hpp"
  39 #include "memory/universe.hpp"
  40 #include "oops/accessDecorators.hpp"
  41 #include "oops/compressedOops.inline.hpp"
  42 #include "oops/klass.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "runtime/continuation.hpp"
  45 #include "runtime/interfaceSupport.inline.hpp"
  46 #include "runtime/javaThread.hpp"
  47 #include "runtime/jniHandles.hpp"
  48 #include "runtime/objectMonitor.hpp"
  49 #include "runtime/os.hpp"
  50 #include "runtime/safepoint.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "utilities/macros.hpp"
  55 
  56 #ifdef PRODUCT
  57 #define BLOCK_COMMENT(str) /* nothing */
  58 #define STOP(error) stop(error)
  59 #else
  60 #define BLOCK_COMMENT(str) block_comment(str)
  61 #define STOP(error) block_comment(error); stop(error)
  62 #endif
  63 
  64 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  65 
  66 #ifdef ASSERT
  67 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  68 #endif
  69 
  70 static const Assembler::Condition reverse[] = {
  71     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  72     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  73     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  74     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  75     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  76     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  77     Assembler::above          /* belowEqual    = 0x6 */ ,
  78     Assembler::belowEqual     /* above         = 0x7 */ ,
  79     Assembler::positive       /* negative      = 0x8 */ ,
  80     Assembler::negative       /* positive      = 0x9 */ ,
  81     Assembler::noParity       /* parity        = 0xa */ ,
  82     Assembler::parity         /* noParity      = 0xb */ ,
  83     Assembler::greaterEqual   /* less          = 0xc */ ,
  84     Assembler::less           /* greaterEqual  = 0xd */ ,
  85     Assembler::greater        /* lessEqual     = 0xe */ ,
  86     Assembler::lessEqual      /* greater       = 0xf, */
  87 
  88 };
  89 
  90 
  91 // Implementation of MacroAssembler
  92 
  93 // First, all the versions that differ between 32-bit and 64-bit,
  94 // unless the difference is trivial (a line or so).
  95 
  96 #ifndef _LP64
  97 
  98 // 32bit versions
  99 
 100 Address MacroAssembler::as_Address(AddressLiteral adr) {
 101   return Address(adr.target(), adr.rspec());
 102 }
 103 
 104 Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
 105   assert(rscratch == noreg, "");
 106   return Address::make_array(adr);
 107 }
 108 
 109 void MacroAssembler::call_VM_leaf_base(address entry_point,
 110                                        int number_of_arguments) {
 111   call(RuntimeAddress(entry_point));
 112   increment(rsp, number_of_arguments * wordSize);
 113 }
 114 
 115 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 116   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 117 }
 118 
 119 
 120 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 121   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 122 }
 123 
 124 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 125   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 126 }
 127 
 128 void MacroAssembler::cmpoop(Register src1, jobject obj, Register rscratch) {
 129   assert(rscratch == noreg, "redundant");
 130   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 131 }
 132 
 133 void MacroAssembler::extend_sign(Register hi, Register lo) {
 134   // According to Intel Doc. AP-526, "Integer Divide", p.18.
 135   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 136     cdql();
 137   } else {
 138     movl(hi, lo);
 139     sarl(hi, 31);
 140   }
 141 }
 142 
 143 void MacroAssembler::jC2(Register tmp, Label& L) {
 144   // set parity bit if FPU flag C2 is set (via rax)
 145   save_rax(tmp);
 146   fwait(); fnstsw_ax();
 147   sahf();
 148   restore_rax(tmp);
 149   // branch
 150   jcc(Assembler::parity, L);
 151 }
 152 
 153 void MacroAssembler::jnC2(Register tmp, Label& L) {
 154   // set parity bit if FPU flag C2 is set (via rax)
 155   save_rax(tmp);
 156   fwait(); fnstsw_ax();
 157   sahf();
 158   restore_rax(tmp);
 159   // branch
 160   jcc(Assembler::noParity, L);
 161 }
 162 
 163 // 32bit can do a case table jump in one instruction but we no longer allow the base
 164 // to be installed in the Address class
 165 void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
 166   assert(rscratch == noreg, "not needed");
 167   jmp(as_Address(entry, noreg));
 168 }
 169 
 170 // Note: y_lo will be destroyed
 171 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 172   // Long compare for Java (semantics as described in JVM spec.)
 173   Label high, low, done;
 174 
 175   cmpl(x_hi, y_hi);
 176   jcc(Assembler::less, low);
 177   jcc(Assembler::greater, high);
 178   // x_hi is the return register
 179   xorl(x_hi, x_hi);
 180   cmpl(x_lo, y_lo);
 181   jcc(Assembler::below, low);
 182   jcc(Assembler::equal, done);
 183 
 184   bind(high);
 185   xorl(x_hi, x_hi);
 186   increment(x_hi);
 187   jmp(done);
 188 
 189   bind(low);
 190   xorl(x_hi, x_hi);
 191   decrementl(x_hi);
 192 
 193   bind(done);
 194 }
 195 
 196 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 197   mov_literal32(dst, (int32_t)src.target(), src.rspec());
 198 }
 199 
 200 void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
 201   assert(rscratch == noreg, "not needed");
 202 
 203   // leal(dst, as_Address(adr));
 204   // see note in movl as to why we must use a move
 205   mov_literal32(dst, (int32_t)adr.target(), adr.rspec());
 206 }
 207 
 208 void MacroAssembler::leave() {
 209   mov(rsp, rbp);
 210   pop(rbp);
 211 }
 212 
 213 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 214   // Multiplication of two Java long values stored on the stack
 215   // as illustrated below. Result is in rdx:rax.
 216   //
 217   // rsp ---> [  ??  ] \               \
 218   //            ....    | y_rsp_offset  |
 219   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 220   //          [ y_hi ]                  | (in bytes)
 221   //            ....                    |
 222   //          [ x_lo ]                 /
 223   //          [ x_hi ]
 224   //            ....
 225   //
 226   // Basic idea: lo(result) = lo(x_lo * y_lo)
 227   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
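       //
       // This follows from expanding the product into 32-bit halves:
       //   (x_hi*2^32 + x_lo) * (y_hi*2^32 + y_lo)
       //     = x_hi*y_hi*2^64 + (x_hi*y_lo + x_lo*y_hi)*2^32 + x_lo*y_lo
       // The 2^64 term falls outside the 64-bit result, and only the low 32 bits
       // of the two cross products can reach hi(result).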
 228   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 229   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 230   Label quick;
 231   // load x_hi, y_hi and check if quick
 232   // multiplication is possible
 233   movl(rbx, x_hi);
 234   movl(rcx, y_hi);
 235   movl(rax, rbx);
 236   orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
 237   jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
 238   // do full multiplication
 239   // 1st step
 240   mull(y_lo);                                    // x_hi * y_lo
 241   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
 242   // 2nd step
 243   movl(rax, x_lo);
 244   mull(rcx);                                     // x_lo * y_hi
 245   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
 246   // 3rd step
 247   bind(quick);                                   // note: rbx, = 0 if quick multiply!
 248   movl(rax, x_lo);
 249   mull(y_lo);                                    // x_lo * y_lo
 250   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 251 }
 252 
 253 void MacroAssembler::lneg(Register hi, Register lo) {
 254   negl(lo);
 255   adcl(hi, 0);
 256   negl(hi);
 257 }
 258 
 259 void MacroAssembler::lshl(Register hi, Register lo) {
 260   // Java shift left long support (semantics as described in JVM spec., p.305)
 261   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 262   // shift value is in rcx !
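       // Example: for s = 40 and n = 32, x << 40 == (x << 32) << 8; the "<< 32"
       // part is done by moving lo into hi and clearing lo, and the remaining
       // "<< 8" is what shldl/shll perform, since the hardware masks the count
       // in rcx to s mod 32 anyway.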
 263   assert(hi != rcx, "must not use rcx");
 264   assert(lo != rcx, "must not use rcx");
 265   const Register s = rcx;                        // shift count
 266   const int      n = BitsPerWord;
 267   Label L;
 268   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 269   cmpl(s, n);                                    // if (s < n)
 270   jcc(Assembler::less, L);                       // else (s >= n)
 271   movl(hi, lo);                                  // x := x << n
 272   xorl(lo, lo);
 273   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 274   bind(L);                                       // s (mod n) < n
 275   shldl(hi, lo);                                 // x := x << s
 276   shll(lo);
 277 }
 278 
 279 
 280 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 281   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 282   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
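       // Example: for s = 40 and n = 32, x >> 40 == (x >> 32) >> 8; hi is moved
       // into lo, hi is refilled with the sign (or zero), and shrdl/sarl (or shrl)
       // then shift by the remaining s mod 32 = 8.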
 283   assert(hi != rcx, "must not use rcx");
 284   assert(lo != rcx, "must not use rcx");
 285   const Register s = rcx;                        // shift count
 286   const int      n = BitsPerWord;
 287   Label L;
 288   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 289   cmpl(s, n);                                    // if (s < n)
 290   jcc(Assembler::less, L);                       // else (s >= n)
 291   movl(lo, hi);                                  // x := x >> n
 292   if (sign_extension) sarl(hi, 31);
 293   else                xorl(hi, hi);
 294   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 295   bind(L);                                       // s (mod n) < n
 296   shrdl(lo, hi);                                 // x := x >> s
 297   if (sign_extension) sarl(hi);
 298   else                shrl(hi);
 299 }
 300 
 301 void MacroAssembler::movoop(Register dst, jobject obj) {
 302   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 303 }
 304 
 305 void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
 306   assert(rscratch == noreg, "redundant");
 307   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 308 }
 309 
 310 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 311   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 312 }
 313 
 314 void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
 315   assert(rscratch == noreg, "redundant");
 316   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 317 }
 318 
 319 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
 320   if (src.is_lval()) {
 321     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 322   } else {
 323     movl(dst, as_Address(src));
 324   }
 325 }
 326 
 327 void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
 328   assert(rscratch == noreg, "redundant");
 329   movl(as_Address(dst, noreg), src);
 330 }
 331 
 332 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 333   movl(dst, as_Address(src, noreg));
 334 }
 335 
 336 void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
 337   assert(rscratch == noreg, "redundant");
 338   movl(dst, src);
 339 }
 340 
 341 void MacroAssembler::pushoop(jobject obj, Register rscratch) {
 342   assert(rscratch == noreg, "redundant");
 343   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 344 }
 345 
 346 void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
 347   assert(rscratch == noreg, "redundant");
 348   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 349 }
 350 
 351 void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
 352   assert(rscratch == noreg, "redundant");
 353   if (src.is_lval()) {
 354     push_literal32((int32_t)src.target(), src.rspec());
 355   } else {
 356     pushl(as_Address(src));
 357   }
 358 }
 359 
 360 static void pass_arg0(MacroAssembler* masm, Register arg) {
 361   masm->push(arg);
 362 }
 363 
 364 static void pass_arg1(MacroAssembler* masm, Register arg) {
 365   masm->push(arg);
 366 }
 367 
 368 static void pass_arg2(MacroAssembler* masm, Register arg) {
 369   masm->push(arg);
 370 }
 371 
 372 static void pass_arg3(MacroAssembler* masm, Register arg) {
 373   masm->push(arg);
 374 }
 375 
 376 #ifndef PRODUCT
 377 extern "C" void findpc(intptr_t x);
 378 #endif
 379 
 380 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 381   // In order to get locks to work, we need to fake an in_VM state
 382   JavaThread* thread = JavaThread::current();
 383   JavaThreadState saved_state = thread->thread_state();
 384   thread->set_thread_state(_thread_in_vm);
 385   if (ShowMessageBoxOnError) {
 389     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 390       ttyLocker ttyl;
 391       BytecodeCounter::print();
 392     }
 393     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 394     // This is the value of eip which points to where verify_oop will return.
 395     if (os::message_box(msg, "Execution stopped, print registers?")) {
 396       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 397       BREAKPOINT;
 398     }
 399   }
 400   fatal("DEBUG MESSAGE: %s", msg);
 401 }
 402 
 403 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 404   ttyLocker ttyl;
 405   DebuggingContext debugging{};
 406   tty->print_cr("eip = 0x%08x", eip);
 407 #ifndef PRODUCT
 408   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 409     tty->cr();
 410     findpc(eip);
 411     tty->cr();
 412   }
 413 #endif
 414 #define PRINT_REG(rax) \
 415   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 416   PRINT_REG(rax);
 417   PRINT_REG(rbx);
 418   PRINT_REG(rcx);
 419   PRINT_REG(rdx);
 420   PRINT_REG(rdi);
 421   PRINT_REG(rsi);
 422   PRINT_REG(rbp);
 423   PRINT_REG(rsp);
 424 #undef PRINT_REG
 425   // Print some words near the top of the stack.
 426   int* dump_sp = (int*) rsp;
 427   for (int col1 = 0; col1 < 8; col1++) {
 428     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 429     os::print_location(tty, *dump_sp++);
 430   }
 431   for (int row = 0; row < 16; row++) {
 432     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 433     for (int col = 0; col < 8; col++) {
 434       tty->print(" 0x%08x", *dump_sp++);
 435     }
 436     tty->cr();
 437   }
 438   // Print some instructions around pc:
 439   Disassembler::decode((address)eip-64, (address)eip);
 440   tty->print_cr("--------");
 441   Disassembler::decode((address)eip, (address)eip+32);
 442 }
 443 
 444 void MacroAssembler::stop(const char* msg) {
 445   // push address of message
 446   ExternalAddress message((address)msg);
 447   pushptr(message.addr(), noreg);
 448   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 449   pusha();                                            // push registers
 450   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 451   hlt();
 452 }
 453 
 454 void MacroAssembler::warn(const char* msg) {
 455   push_CPU_state();
 456 
 457   // push address of message
 458   ExternalAddress message((address)msg);
 459   pushptr(message.addr(), noreg);
 460 
 461   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 462   addl(rsp, wordSize);       // discard argument
 463   pop_CPU_state();
 464 }
 465 
 466 void MacroAssembler::print_state() {
 467   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 468   pusha();                                            // push registers
 469 
 470   push_CPU_state();
 471   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 472   pop_CPU_state();
 473 
 474   popa();
 475   addl(rsp, wordSize);
 476 }
 477 
 478 #else // _LP64
 479 
 480 // 64 bit versions
 481 
 482 Address MacroAssembler::as_Address(AddressLiteral adr) {
 483   // amd64 always does this as a pc-rel address;
 484   // we can be absolute or displacement based on the instruction type.
 485   // jmp/call use displacements, others are absolute.
 486   assert(!adr.is_lval(), "must be rval");
 487   assert(reachable(adr), "must be");
 488   return Address(checked_cast<int32_t>(adr.target() - pc()), adr.target(), adr.reloc());
 489 
 490 }
 491 
 492 Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
 493   AddressLiteral base = adr.base();
 494   lea(rscratch, base);
 495   Address index = adr.index();
 496   assert(index._disp == 0, "must not have disp"); // maybe it can?
 497   Address array(rscratch, index._index, index._scale, index._disp);
 498   return array;
 499 }
 500 
 501 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 502   Label L, E;
 503 
 504 #ifdef _WIN64
 505   // Windows always allocates space for its register args
 506   assert(num_args <= 4, "only register arguments supported");
 507   subq(rsp,  frame::arg_reg_save_area_bytes);
 508 #endif
 509 
 510   // Align stack if necessary
 511   testl(rsp, 15);
 512   jcc(Assembler::zero, L);
 513 
 514   subq(rsp, 8);
 515   call(RuntimeAddress(entry_point));
 516   addq(rsp, 8);
 517   jmp(E);
 518 
 519   bind(L);
 520   call(RuntimeAddress(entry_point));
 521 
 522   bind(E);
 523 
 524 #ifdef _WIN64
 525   // restore stack pointer
 526   addq(rsp, frame::arg_reg_save_area_bytes);
 527 #endif
 528 
 529 }
 530 
 531 void MacroAssembler::cmp64(Register src1, AddressLiteral src2, Register rscratch) {
 532   assert(!src2.is_lval(), "should use cmpptr");
 533   assert(rscratch != noreg || always_reachable(src2), "missing");
 534 
 535   if (reachable(src2)) {
 536     cmpq(src1, as_Address(src2));
 537   } else {
 538     lea(rscratch, src2);
 539     Assembler::cmpq(src1, Address(rscratch, 0));
 540   }
 541 }
 542 
 543 int MacroAssembler::corrected_idivq(Register reg) {
 544   // Full implementation of Java ldiv and lrem; checks for special
 545   // case as described in JVM spec., p.243 & p.271.  The function
 546   // returns the (pc) offset of the idivq instruction - may be needed
 547   // for implicit exceptions.
 548   //
 549   //         normal case                           special case
 550   //
 551   // input : rax: dividend                         min_long
 552   //         reg: divisor   (may not be rax/rdx)   -1
 553   //
 554   // output: rax: quotient  (= rax idiv reg)       min_long
 555   //         rdx: remainder (= rax irem reg)       0
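       //
       // The special case matters because min_long / -1 overflows 64 bits: the
       // hardware idiv would raise a divide error (#DE) instead of producing a
       // result, whereas Java defines the quotient as min_long and remainder 0.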
 556   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 557   static const int64_t min_long = 0x8000000000000000;
 558   Label normal_case, special_case;
 559 
 560   // check for special case
 561   cmp64(rax, ExternalAddress((address) &min_long), rdx /*rscratch*/);
 562   jcc(Assembler::notEqual, normal_case);
 563   xorl(rdx, rdx); // prepare rdx for possible special case (where
 564                   // remainder = 0)
 565   cmpq(reg, -1);
 566   jcc(Assembler::equal, special_case);
 567 
 568   // handle normal case
 569   bind(normal_case);
 570   cdqq();
 571   int idivq_offset = offset();
 572   idivq(reg);
 573 
 574   // normal and special case exit
 575   bind(special_case);
 576 
 577   return idivq_offset;
 578 }
 579 
 580 void MacroAssembler::decrementq(Register reg, int value) {
 581   if (value == min_jint) { subq(reg, value); return; }
 582   if (value <  0) { incrementq(reg, -value); return; }
 583   if (value == 0) {                        ; return; }
 584   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 585   /* else */      { subq(reg, value)       ; return; }
 586 }
 587 
 588 void MacroAssembler::decrementq(Address dst, int value) {
 589   if (value == min_jint) { subq(dst, value); return; }
 590   if (value <  0) { incrementq(dst, -value); return; }
 591   if (value == 0) {                        ; return; }
 592   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 593   /* else */      { subq(dst, value)       ; return; }
 594 }
 595 
 596 void MacroAssembler::incrementq(AddressLiteral dst, Register rscratch) {
 597   assert(rscratch != noreg || always_reachable(dst), "missing");
 598 
 599   if (reachable(dst)) {
 600     incrementq(as_Address(dst));
 601   } else {
 602     lea(rscratch, dst);
 603     incrementq(Address(rscratch, 0));
 604   }
 605 }
 606 
 607 void MacroAssembler::incrementq(Register reg, int value) {
 608   if (value == min_jint) { addq(reg, value); return; }
 609   if (value <  0) { decrementq(reg, -value); return; }
 610   if (value == 0) {                        ; return; }
 611   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 612   /* else */      { addq(reg, value)       ; return; }
 613 }
 614 
 615 void MacroAssembler::incrementq(Address dst, int value) {
 616   if (value == min_jint) { addq(dst, value); return; }
 617   if (value <  0) { decrementq(dst, -value); return; }
 618   if (value == 0) {                        ; return; }
 619   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 620   /* else */      { addq(dst, value)       ; return; }
 621 }
 622 
 623 // 32bit can do a case table jump in one instruction but we no longer allow the base
 624 // to be installed in the Address class
 625 void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
 626   lea(rscratch, entry.base());
 627   Address dispatch = entry.index();
 628   assert(dispatch._base == noreg, "must be");
 629   dispatch._base = rscratch;
 630   jmp(dispatch);
 631 }
 632 
 633 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 634   ShouldNotReachHere(); // 64bit doesn't use two regs
 635   cmpq(x_lo, y_lo);
 636 }
 637 
 638 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 639   mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 640 }
 641 
 642 void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
 643   lea(rscratch, adr);
 644   movptr(dst, rscratch);
 645 }
 646 
 647 void MacroAssembler::leave() {
 648   // %%% is this really better? Why not on 32bit too?
 649   emit_int8((unsigned char)0xC9); // LEAVE
 650 }
 651 
 652 void MacroAssembler::lneg(Register hi, Register lo) {
 653   ShouldNotReachHere(); // 64bit doesn't use two regs
 654   negq(lo);
 655 }
 656 
 657 void MacroAssembler::movoop(Register dst, jobject obj) {
 658   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 659 }
 660 
 661 void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
 662   mov_literal64(rscratch, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 663   movq(dst, rscratch);
 664 }
 665 
 666 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 667   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 668 }
 669 
 670 void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
 671   mov_literal64(rscratch, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 672   movq(dst, rscratch);
 673 }
 674 
 675 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
 676   if (src.is_lval()) {
 677     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 678   } else {
 679     if (reachable(src)) {
 680       movq(dst, as_Address(src));
 681     } else {
 682       lea(dst, src);
 683       movq(dst, Address(dst, 0));
 684     }
 685   }
 686 }
 687 
 688 void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
 689   movq(as_Address(dst, rscratch), src);
 690 }
 691 
 692 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 693   movq(dst, as_Address(src, dst /*rscratch*/));
 694 }
 695 
 696 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 697 void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
 698   if (is_simm32(src)) {
 699     movptr(dst, checked_cast<int32_t>(src));
 700   } else {
 701     mov64(rscratch, src);
 702     movq(dst, rscratch);
 703   }
 704 }
 705 
 706 void MacroAssembler::pushoop(jobject obj, Register rscratch) {
 707   movoop(rscratch, obj);
 708   push(rscratch);
 709 }
 710 
 711 void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
 712   mov_metadata(rscratch, obj);
 713   push(rscratch);
 714 }
 715 
 716 void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
 717   lea(rscratch, src);
 718   if (src.is_lval()) {
 719     push(rscratch);
 720   } else {
 721     pushq(Address(rscratch, 0));
 722   }
 723 }
 724 
 725 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 726   reset_last_Java_frame(r15_thread, clear_fp);
 727 }
 728 
 729 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 730                                          Register last_java_fp,
 731                                          address  last_java_pc,
 732                                          Register rscratch) {
 733   set_last_Java_frame(r15_thread, last_java_sp, last_java_fp, last_java_pc, rscratch);
 734 }
 735 
 736 static void pass_arg0(MacroAssembler* masm, Register arg) {
 737   if (c_rarg0 != arg ) {
 738     masm->mov(c_rarg0, arg);
 739   }
 740 }
 741 
 742 static void pass_arg1(MacroAssembler* masm, Register arg) {
 743   if (c_rarg1 != arg ) {
 744     masm->mov(c_rarg1, arg);
 745   }
 746 }
 747 
 748 static void pass_arg2(MacroAssembler* masm, Register arg) {
 749   if (c_rarg2 != arg ) {
 750     masm->mov(c_rarg2, arg);
 751   }
 752 }
 753 
 754 static void pass_arg3(MacroAssembler* masm, Register arg) {
 755   if (c_rarg3 != arg ) {
 756     masm->mov(c_rarg3, arg);
 757   }
 758 }
 759 
 760 void MacroAssembler::stop(const char* msg) {
 761   if (ShowMessageBoxOnError) {
 762     address rip = pc();
 763     pusha(); // get regs on stack
 764     lea(c_rarg1, InternalAddress(rip));
 765     movq(c_rarg2, rsp); // pass pointer to regs array
 766   }
 767   lea(c_rarg0, ExternalAddress((address) msg));
 768   andq(rsp, -16); // align stack as required by ABI
 769   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 770   hlt();
 771 }
 772 
 773 void MacroAssembler::warn(const char* msg) {
 774   push(rbp);
 775   movq(rbp, rsp);
 776   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 777   push_CPU_state();   // keeps alignment at 16 bytes
 778 
 779   lea(c_rarg0, ExternalAddress((address) msg));
 780   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 781 
 782   pop_CPU_state();
 783   mov(rsp, rbp);
 784   pop(rbp);
 785 }
 786 
 787 void MacroAssembler::print_state() {
 788   address rip = pc();
 789   pusha();            // get regs on stack
 790   push(rbp);
 791   movq(rbp, rsp);
 792   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 793   push_CPU_state();   // keeps alignment at 16 bytes
 794 
 795   lea(c_rarg0, InternalAddress(rip));
 796   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 797   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 798 
 799   pop_CPU_state();
 800   mov(rsp, rbp);
 801   pop(rbp);
 802   popa();
 803 }
 804 
 805 #ifndef PRODUCT
 806 extern "C" void findpc(intptr_t x);
 807 #endif
 808 
 809 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 810   // In order to get locks to work, we need to fake an in_VM state
 811   if (ShowMessageBoxOnError) {
 812     JavaThread* thread = JavaThread::current();
 813     JavaThreadState saved_state = thread->thread_state();
 814     thread->set_thread_state(_thread_in_vm);
 815 #ifndef PRODUCT
 816     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 817       ttyLocker ttyl;
 818       BytecodeCounter::print();
 819     }
 820 #endif
 821     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 822     // XXX correct this offset for amd64
 823     // This is the value of eip which points to where verify_oop will return.
 824     if (os::message_box(msg, "Execution stopped, print registers?")) {
 825       print_state64(pc, regs);
 826       BREAKPOINT;
 827     }
 828   }
 829   fatal("DEBUG MESSAGE: %s", msg);
 830 }
 831 
 832 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 833   ttyLocker ttyl;
 834   DebuggingContext debugging{};
 835   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 836 #ifndef PRODUCT
 837   tty->cr();
 838   findpc(pc);
 839   tty->cr();
 840 #endif
 841 #define PRINT_REG(rax, value) \
 842   { tty->print("%s = ", #rax); os::print_location(tty, value); }
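       // regs[] is the frame laid down by pusha(): judging by the indices used
       // below, rax was pushed first (index 15, highest address) and r15 last
       // (index 0, lowest address); the pre-pusha rsp is recomputed as &regs[16]
       // rather than read from a saved slot.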
 843   PRINT_REG(rax, regs[15]);
 844   PRINT_REG(rbx, regs[12]);
 845   PRINT_REG(rcx, regs[14]);
 846   PRINT_REG(rdx, regs[13]);
 847   PRINT_REG(rdi, regs[8]);
 848   PRINT_REG(rsi, regs[9]);
 849   PRINT_REG(rbp, regs[10]);
 850   // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
 851   PRINT_REG(rsp, (intptr_t)(&regs[16]));
 852   PRINT_REG(r8 , regs[7]);
 853   PRINT_REG(r9 , regs[6]);
 854   PRINT_REG(r10, regs[5]);
 855   PRINT_REG(r11, regs[4]);
 856   PRINT_REG(r12, regs[3]);
 857   PRINT_REG(r13, regs[2]);
 858   PRINT_REG(r14, regs[1]);
 859   PRINT_REG(r15, regs[0]);
 860 #undef PRINT_REG
 861   // Print some words near the top of the stack.
 862   int64_t* rsp = &regs[16];
 863   int64_t* dump_sp = rsp;
 864   for (int col1 = 0; col1 < 8; col1++) {
 865     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 866     os::print_location(tty, *dump_sp++);
 867   }
 868   for (int row = 0; row < 25; row++) {
 869     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 870     for (int col = 0; col < 4; col++) {
 871       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 872     }
 873     tty->cr();
 874   }
 875   // Print some instructions around pc:
 876   Disassembler::decode((address)pc-64, (address)pc);
 877   tty->print_cr("--------");
 878   Disassembler::decode((address)pc, (address)pc+32);
 879 }
 880 
 881 // The java_calling_convention describes stack locations as ideal slots on
 882 // a frame with no abi restrictions. Since we must observe abi restrictions
 883 // (like the placement of the register window) the slots must be biased by
 884 // the following value.
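     // For example, with VMRegImpl::stack_slot_size == 4, incoming stack slot 0
     // maps to rbp + 16, i.e. just past the saved rbp and the return address
     // (two 8-byte words on 64-bit).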
 885 static int reg2offset_in(VMReg r) {
 886   // Account for saved rbp and return address
 887   // This should really be in_preserve_stack_slots
 888   return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
 889 }
 890 
 891 static int reg2offset_out(VMReg r) {
 892   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
 893 }
 894 
 895 // A long move
 896 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 897 
 898   // The calling convention assures us that each VMRegPair is either
 899   // entirely one physical register or a pair of adjacent stack slots.
 900 
 901   if (src.is_single_phys_reg() ) {
 902     if (dst.is_single_phys_reg()) {
 903       if (dst.first() != src.first()) {
 904         mov(dst.first()->as_Register(), src.first()->as_Register());
 905       }
 906     } else {
 907       assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
 908              src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
 909       movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
 910     }
 911   } else if (dst.is_single_phys_reg()) {
 912     assert(src.is_single_reg(),  "not a stack pair");
 913     movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 914   } else {
 915     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 916     movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 917     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 918   }
 919 }
 920 
 921 // A double move
 922 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 923 
 924   // The calling convention assures us that each VMRegPair is either
 925   // entirely one physical register or a pair of adjacent stack slots.
 926 
 927   if (src.is_single_phys_reg() ) {
 928     if (dst.is_single_phys_reg()) {
 929       // In theory these overlap but the ordering is such that this is likely a nop
 930       if ( src.first() != dst.first()) {
 931         movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
 932       }
 933     } else {
 934       assert(dst.is_single_reg(), "not a stack pair");
 935       movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
 936     }
 937   } else if (dst.is_single_phys_reg()) {
 938     assert(src.is_single_reg(),  "not a stack pair");
 939     movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 940   } else {
 941     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 942     movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 943     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 944   }
 945 }
 946 
 947 
 948 // A float arg may have to be moved via an integer register (e.g. for a stack-to-stack copy)
 949 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 950   assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
 951 
 952   // The calling convention assures us that each VMRegPair is either
 953   // entirely one physical register or a pair of adjacent stack slots.
 954 
 955   if (src.first()->is_stack()) {
 956     if (dst.first()->is_stack()) {
 957       movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 958       movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 959     } else {
 960       // stack to reg
 961       assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
 962       movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 963     }
 964   } else if (dst.first()->is_stack()) {
 965     // reg to stack
 966     assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
 967     movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
 968   } else {
 969     // reg to reg
 970     // In theory these overlap but the ordering is such that this is likely a nop
 971     if ( src.first() != dst.first()) {
 972       movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
 973     }
 974   }
 975 }
 976 
 977 // On 64-bit we store integer-like items to the stack as 64-bit items
 978 // (x86_32/64 ABI), even though Java only needs 32 bits for the parameter.
 979 // On 32-bit it is simply 32 bits, so this routine does 32->32 on 32-bit
 980 // and 32->64 on 64-bit.
 981 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 982   if (src.first()->is_stack()) {
 983     if (dst.first()->is_stack()) {
 984       // stack to stack
 985       movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 986       movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 987     } else {
 988       // stack to reg
 989       movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 990     }
 991   } else if (dst.first()->is_stack()) {
 992     // reg to stack
 993     // Do we really have to sign extend???
 994     // __ movslq(src.first()->as_Register(), src.first()->as_Register());
 995     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
 996   } else {
 997     // Do we really have to sign extend???
 998     // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
 999     if (dst.first() != src.first()) {
1000       movq(dst.first()->as_Register(), src.first()->as_Register());
1001     }
1002   }
1003 }
1004 
1005 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
1006   if (src.first()->is_stack()) {
1007     if (dst.first()->is_stack()) {
1008       // stack to stack
1009       movq(rax, Address(rbp, reg2offset_in(src.first())));
1010       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1011     } else {
1012       // stack to reg
1013       movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1014     }
1015   } else if (dst.first()->is_stack()) {
1016     // reg to stack
1017     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1018   } else {
1019     if (dst.first() != src.first()) {
1020       movq(dst.first()->as_Register(), src.first()->as_Register());
1021     }
1022   }
1023 }
1024 
1025 // An oop arg. Must pass a handle not the oop itself
1026 void MacroAssembler::object_move(OopMap* map,
1027                         int oop_handle_offset,
1028                         int framesize_in_slots,
1029                         VMRegPair src,
1030                         VMRegPair dst,
1031                         bool is_receiver,
1032                         int* receiver_offset) {
1033 
1034   // must pass a handle. First figure out the location we use as a handle
1035 
1036   Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1037 
1038   // See if the oop is null; if it is, we need no handle
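       // (A JNI callee expects a null jobject for a null reference, not a handle
       //  whose contents are null, hence the conditional moves below.)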
1039 
1040   if (src.first()->is_stack()) {
1041 
1042     // Oop is already on the stack as an argument
1043     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1044     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1045     if (is_receiver) {
1046       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1047     }
1048 
1049     cmpptr(Address(rbp, reg2offset_in(src.first())), NULL_WORD);
1050     lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1051     // conditionally move a null
1052     cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1053   } else {
1054 
1055     // The oop is in a register; we must store it to the space we reserve
1056     // on the stack for oop handles, and pass a handle if the oop is non-null.
1057 
1058     const Register rOop = src.first()->as_Register();
1059     int oop_slot;
1060     if (rOop == j_rarg0)
1061       oop_slot = 0;
1062     else if (rOop == j_rarg1)
1063       oop_slot = 1;
1064     else if (rOop == j_rarg2)
1065       oop_slot = 2;
1066     else if (rOop == j_rarg3)
1067       oop_slot = 3;
1068     else if (rOop == j_rarg4)
1069       oop_slot = 4;
1070     else {
1071       assert(rOop == j_rarg5, "wrong register");
1072       oop_slot = 5;
1073     }
1074 
1075     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1076     int offset = oop_slot*VMRegImpl::stack_slot_size;
1077 
1078     map->set_oop(VMRegImpl::stack2reg(oop_slot));
1079     // Store oop in handle area, may be null
1080     movptr(Address(rsp, offset), rOop);
1081     if (is_receiver) {
1082       *receiver_offset = offset;
1083     }
1084 
1085     cmpptr(rOop, NULL_WORD);
1086     lea(rHandle, Address(rsp, offset));
1087     // conditionally move a null from the handle area where it was just stored
1088     cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1089   }
1090 
1091   // If the arg is destined for the stack, store it there; otherwise it is already in the correct register.
1092   if (dst.first()->is_stack()) {
1093     movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1094   }
1095 }
1096 
1097 #endif // _LP64
1098 
1099 // Now versions that are common to 32/64 bit
1100 
1101 void MacroAssembler::addptr(Register dst, int32_t imm32) {
1102   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
1103 }
1104 
1105 void MacroAssembler::addptr(Register dst, Register src) {
1106   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1107 }
1108 
1109 void MacroAssembler::addptr(Address dst, Register src) {
1110   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1111 }
1112 
1113 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1114   assert(rscratch != noreg || always_reachable(src), "missing");
1115 
1116   if (reachable(src)) {
1117     Assembler::addsd(dst, as_Address(src));
1118   } else {
1119     lea(rscratch, src);
1120     Assembler::addsd(dst, Address(rscratch, 0));
1121   }
1122 }
1123 
1124 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1125   assert(rscratch != noreg || always_reachable(src), "missing");
1126 
1127   if (reachable(src)) {
1128     addss(dst, as_Address(src));
1129   } else {
1130     lea(rscratch, src);
1131     addss(dst, Address(rscratch, 0));
1132   }
1133 }
1134 
1135 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1136   assert(rscratch != noreg || always_reachable(src), "missing");
1137 
1138   if (reachable(src)) {
1139     Assembler::addpd(dst, as_Address(src));
1140   } else {
1141     lea(rscratch, src);
1142     Assembler::addpd(dst, Address(rscratch, 0));
1143   }
1144 }
1145 
1146 // See 8273459.  Function for ensuring 64-byte alignment, intended for stubs only.
1147 // Stub code is generated once and never copied.
1148 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
1149 void MacroAssembler::align64() {
1150   align(64, (unsigned long long) pc());
1151 }
1152 
1153 void MacroAssembler::align32() {
1154   align(32, (unsigned long long) pc());
1155 }
1156 
1157 void MacroAssembler::align(int modulus) {
1158   // 8273459: Ensure alignment is possible with current segment alignment
1159   assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
1160   align(modulus, offset());
1161 }
1162 
1163 void MacroAssembler::align(int modulus, int target) {
1164   if (target % modulus != 0) {
1165     nop(modulus - (target % modulus));
1166   }
1167 }
1168 
1169 void MacroAssembler::push_f(XMMRegister r) {
1170   subptr(rsp, wordSize);
1171   movflt(Address(rsp, 0), r);
1172 }
1173 
1174 void MacroAssembler::pop_f(XMMRegister r) {
1175   movflt(r, Address(rsp, 0));
1176   addptr(rsp, wordSize);
1177 }
1178 
1179 void MacroAssembler::push_d(XMMRegister r) {
1180   subptr(rsp, 2 * wordSize);
1181   movdbl(Address(rsp, 0), r);
1182 }
1183 
1184 void MacroAssembler::pop_d(XMMRegister r) {
1185   movdbl(r, Address(rsp, 0));
1186   addptr(rsp, 2 * Interpreter::stackElementSize);
1187 }
1188 
1189 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1190   // Used in sign-masking with aligned address.
1191   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1192   assert(rscratch != noreg || always_reachable(src), "missing");
1193 
1194   if (reachable(src)) {
1195     Assembler::andpd(dst, as_Address(src));
1196   } else {
1197     lea(rscratch, src);
1198     Assembler::andpd(dst, Address(rscratch, 0));
1199   }
1200 }
1201 
1202 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register rscratch) {
1203   // Used in sign-masking with aligned address.
1204   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1205   assert(rscratch != noreg || always_reachable(src), "missing");
1206 
1207   if (reachable(src)) {
1208     Assembler::andps(dst, as_Address(src));
1209   } else {
1210     lea(rscratch, src);
1211     Assembler::andps(dst, Address(rscratch, 0));
1212   }
1213 }
1214 
1215 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1216   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1217 }
1218 
1219 #ifdef _LP64
1220 void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) {
1221   assert(rscratch != noreg || always_reachable(src), "missing");
1222 
1223   if (reachable(src)) {
1224     andq(dst, as_Address(src));
1225   } else {
1226     lea(rscratch, src);
1227     andq(dst, Address(rscratch, 0));
1228   }
1229 }
1230 #endif
1231 
1232 void MacroAssembler::atomic_incl(Address counter_addr) {
1233   lock();
1234   incrementl(counter_addr);
1235 }
1236 
1237 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register rscratch) {
1238   assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1239 
1240   if (reachable(counter_addr)) {
1241     atomic_incl(as_Address(counter_addr));
1242   } else {
1243     lea(rscratch, counter_addr);
1244     atomic_incl(Address(rscratch, 0));
1245   }
1246 }
1247 
1248 #ifdef _LP64
1249 void MacroAssembler::atomic_incq(Address counter_addr) {
1250   lock();
1251   incrementq(counter_addr);
1252 }
1253 
1254 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register rscratch) {
1255   assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1256 
1257   if (reachable(counter_addr)) {
1258     atomic_incq(as_Address(counter_addr));
1259   } else {
1260     lea(rscratch, counter_addr);
1261     atomic_incq(Address(rscratch, 0));
1262   }
1263 }
1264 #endif
1265 
1266 // Writes to successive stack pages, until the given offset is reached, to
1267 // check for stack overflow + shadow pages.  This clobbers tmp.
1268 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1269   movptr(tmp, rsp);
1270   // Bang stack for total size given plus shadow page size.
1271   // Bang one page at a time because large size can bang beyond yellow and
1272   // red zones.
1273   Label loop;
1274   bind(loop);
1275   movl(Address(tmp, (-(int)os::vm_page_size())), size );
1276   subptr(tmp, (int)os::vm_page_size());
1277   subl(size, (int)os::vm_page_size());
1278   jcc(Assembler::greater, loop);
1279 
1280   // Bang down shadow pages too.
1281   // At this point, (tmp-0) is the last address touched, so don't
1282   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1283   // was post-decremented.)  Skip this address by starting at i=1, and
1284   // touch a few more pages below.  N.B.  It is important to touch all
1285   // the way down including all pages in the shadow zone.
1286   for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()); i++) {
1287     // This could be any sized move, but since it can serve as a debugging
1288     // crumb, the bigger the better.
1289     movptr(Address(tmp, (-i*(int)os::vm_page_size())), size );
1290   }
1291 }
1292 
1293 void MacroAssembler::reserved_stack_check() {
1294   // testing if reserved zone needs to be enabled
1295   Label no_reserved_zone_enabling;
1296   Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1297   NOT_LP64(get_thread(rsi);)
1298 
1299   cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1300   jcc(Assembler::below, no_reserved_zone_enabling);
1301 
1302   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1303   jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1304   should_not_reach_here();
1305 
1306   bind(no_reserved_zone_enabling);
1307 }
1308 
1309 void MacroAssembler::c2bool(Register x) {
1310   // implements x == 0 ? 0 : 1
1311   // note: must only look at least-significant byte of x
1312   //       since C-style booleans are stored in one byte
1313   //       only! (was bug)
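       // Example: if the upper bytes hold garbage, say x == 0x0100 with a zero
       // low byte, testing the full register would wrongly yield 1; masking to
       // the low byte first gives the correct result of 0.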
1314   andl(x, 0xFF);
1315   setb(Assembler::notZero, x);
1316 }
1317 
1318 // Wouldn't be needed if the AddressLiteral version had a new name
1319 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1320   Assembler::call(L, rtype);
1321 }
1322 
1323 void MacroAssembler::call(Register entry) {
1324   Assembler::call(entry);
1325 }
1326 
1327 void MacroAssembler::call(AddressLiteral entry, Register rscratch) {
1328   assert(rscratch != noreg || always_reachable(entry), "missing");
1329 
1330   if (reachable(entry)) {
1331     Assembler::call_literal(entry.target(), entry.rspec());
1332   } else {
1333     lea(rscratch, entry);
1334     Assembler::call(rscratch);
1335   }
1336 }
1337 
1338 void MacroAssembler::ic_call(address entry, jint method_index) {
1339   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
1340 #ifdef _LP64
1341   // Needs full 64-bit immediate for later patching.
1342   mov64(rax, (intptr_t)Universe::non_oop_word());
1343 #else
1344   movptr(rax, (intptr_t)Universe::non_oop_word());
1345 #endif
1346   call(AddressLiteral(entry, rh));
1347 }
1348 
1349 void MacroAssembler::emit_static_call_stub() {
1350   // Static stub relocation also tags the Method* in the code-stream.
1351   mov_metadata(rbx, (Metadata*) nullptr);  // Method is zapped till fixup time.
1352   // This is recognized as unresolved by relocs/nativeinst/ic code.
1353   jump(RuntimeAddress(pc()));
1354 }
1355 
1356 // Implementation of call_VM versions
1357 
1358 void MacroAssembler::call_VM(Register oop_result,
1359                              address entry_point,
1360                              bool check_exceptions) {
1361   Label C, E;
1362   call(C, relocInfo::none);
1363   jmp(E);
1364 
1365   bind(C);
1366   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1367   ret(0);
1368 
1369   bind(E);
1370 }
1371 
1372 void MacroAssembler::call_VM(Register oop_result,
1373                              address entry_point,
1374                              Register arg_1,
1375                              bool check_exceptions) {
1376   Label C, E;
1377   call(C, relocInfo::none);
1378   jmp(E);
1379 
1380   bind(C);
1381   pass_arg1(this, arg_1);
1382   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1383   ret(0);
1384 
1385   bind(E);
1386 }
1387 
1388 void MacroAssembler::call_VM(Register oop_result,
1389                              address entry_point,
1390                              Register arg_1,
1391                              Register arg_2,
1392                              bool check_exceptions) {
1393   Label C, E;
1394   call(C, relocInfo::none);
1395   jmp(E);
1396 
1397   bind(C);
1398 
1399   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1400 
1401   pass_arg2(this, arg_2);
1402   pass_arg1(this, arg_1);
1403   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1404   ret(0);
1405 
1406   bind(E);
1407 }
1408 
1409 void MacroAssembler::call_VM(Register oop_result,
1410                              address entry_point,
1411                              Register arg_1,
1412                              Register arg_2,
1413                              Register arg_3,
1414                              bool check_exceptions) {
1415   Label C, E;
1416   call(C, relocInfo::none);
1417   jmp(E);
1418 
1419   bind(C);
1420 
1421   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1422   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1423   pass_arg3(this, arg_3);
1424 
1425   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1426   pass_arg2(this, arg_2);
1427 
1428   pass_arg1(this, arg_1);
1429   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1430   ret(0);
1431 
1432   bind(E);
1433 }
1434 
1435 void MacroAssembler::call_VM(Register oop_result,
1436                              Register last_java_sp,
1437                              address entry_point,
1438                              int number_of_arguments,
1439                              bool check_exceptions) {
1440   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1441   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1442 }
1443 
1444 void MacroAssembler::call_VM(Register oop_result,
1445                              Register last_java_sp,
1446                              address entry_point,
1447                              Register arg_1,
1448                              bool check_exceptions) {
1449   pass_arg1(this, arg_1);
1450   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1451 }
1452 
1453 void MacroAssembler::call_VM(Register oop_result,
1454                              Register last_java_sp,
1455                              address entry_point,
1456                              Register arg_1,
1457                              Register arg_2,
1458                              bool check_exceptions) {
1459 
1460   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1461   pass_arg2(this, arg_2);
1462   pass_arg1(this, arg_1);
1463   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1464 }
1465 
1466 void MacroAssembler::call_VM(Register oop_result,
1467                              Register last_java_sp,
1468                              address entry_point,
1469                              Register arg_1,
1470                              Register arg_2,
1471                              Register arg_3,
1472                              bool check_exceptions) {
1473   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1474   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1475   pass_arg3(this, arg_3);
1476   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1477   pass_arg2(this, arg_2);
1478   pass_arg1(this, arg_1);
1479   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1480 }
1481 
1482 void MacroAssembler::super_call_VM(Register oop_result,
1483                                    Register last_java_sp,
1484                                    address entry_point,
1485                                    int number_of_arguments,
1486                                    bool check_exceptions) {
1487   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1488   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1489 }
1490 
1491 void MacroAssembler::super_call_VM(Register oop_result,
1492                                    Register last_java_sp,
1493                                    address entry_point,
1494                                    Register arg_1,
1495                                    bool check_exceptions) {
1496   pass_arg1(this, arg_1);
1497   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1498 }
1499 
1500 void MacroAssembler::super_call_VM(Register oop_result,
1501                                    Register last_java_sp,
1502                                    address entry_point,
1503                                    Register arg_1,
1504                                    Register arg_2,
1505                                    bool check_exceptions) {
1506 
1507   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1508   pass_arg2(this, arg_2);
1509   pass_arg1(this, arg_1);
1510   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1511 }
1512 
1513 void MacroAssembler::super_call_VM(Register oop_result,
1514                                    Register last_java_sp,
1515                                    address entry_point,
1516                                    Register arg_1,
1517                                    Register arg_2,
1518                                    Register arg_3,
1519                                    bool check_exceptions) {
1520   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1521   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1522   pass_arg3(this, arg_3);
1523   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1524   pass_arg2(this, arg_2);
1525   pass_arg1(this, arg_1);
1526   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1527 }
1528 
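// Common implementation behind the call_VM variants above: set the last Java
// frame, pass the current thread as the implicit first argument, call into the
// VM, reset the frame anchor, optionally check for a pending exception, and
// fetch the oop result if one is expected.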
1529 void MacroAssembler::call_VM_base(Register oop_result,
1530                                   Register java_thread,
1531                                   Register last_java_sp,
1532                                   address  entry_point,
1533                                   int      number_of_arguments,
1534                                   bool     check_exceptions) {
1535   // determine java_thread register
1536   if (!java_thread->is_valid()) {
1537 #ifdef _LP64
1538     java_thread = r15_thread;
1539 #else
1540     java_thread = rdi;
1541     get_thread(java_thread);
1542 #endif // LP64
1543   }
1544   // determine last_java_sp register
1545   if (!last_java_sp->is_valid()) {
1546     last_java_sp = rsp;
1547   }
1548   // debugging support
1549   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
1550   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
1551 #ifdef ASSERT
1552   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1553   // r12 is the heapbase.
1554   LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
1555 #endif // ASSERT
1556 
1557   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
1558   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1559 
1560   // push java thread (becomes first argument of C function)
1561 
1562   NOT_LP64(push(java_thread); number_of_arguments++);
1563   LP64_ONLY(mov(c_rarg0, r15_thread));
1564 
1565   // set last Java frame before call
1566   assert(last_java_sp != rbp, "can't use ebp/rbp");
1567 
1568   // Only interpreter should have to set fp
1569   set_last_Java_frame(java_thread, last_java_sp, rbp, nullptr, rscratch1);
1570 
1571   // do the call, remove parameters
1572   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1573 
1574   // restore the thread (cannot use the pushed argument since arguments
1575   // may be overwritten by C code generated by an optimizing compiler);
1576   // however, we can use the register value directly if it is callee-saved.
1577   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
1578     // rdi & rsi (also r15) are callee saved -> nothing to do
1579 #ifdef ASSERT
1580     guarantee(java_thread != rax, "change this code");
1581     push(rax);
1582     { Label L;
1583       get_thread(rax);
1584       cmpptr(java_thread, rax);
1585       jcc(Assembler::equal, L);
1586       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
1587       bind(L);
1588     }
1589     pop(rax);
1590 #endif
1591   } else {
1592     get_thread(java_thread);
1593   }
1594   // reset last Java frame
1595   // Only interpreter should have to clear fp
1596   reset_last_Java_frame(java_thread, true);
1597 
1598   // C++ interpreter handles this in the interpreter
1599   check_and_handle_popframe(java_thread);
1600   check_and_handle_earlyret(java_thread);
1601 
1602   if (check_exceptions) {
1603     // check for pending exceptions (java_thread is set upon return)
1604     cmpptr(Address(java_thread, Thread::pending_exception_offset()), NULL_WORD);
1605 #ifndef _LP64
1606     jump_cc(Assembler::notEqual,
1607             RuntimeAddress(StubRoutines::forward_exception_entry()));
1608 #else
1609     // This used to conditionally jump to forward_exception; however, if the code
1610     // is relocated, the conditional branch might not reach the stub. So instead we
1611     // conditionally skip over an unconditional far jump, which can always reach it.
1612 
1613     Label ok;
1614     jcc(Assembler::equal, ok);
1615     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1616     bind(ok);
1617 #endif // LP64
1618   }
1619 
1620   // get oop result if there is one and reset the value in the thread
1621   if (oop_result->is_valid()) {
1622     get_vm_result(oop_result, java_thread);
1623   }
1624 }
1625 
1626 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1627 
1628   // Calculating the value for last_Java_sp is somewhat subtle.
1629   // call_VM does an intermediate call which places a return address
1630   // on the stack just under the stack pointer as the caller finished
1631   // with it. This allows us to retrieve last_Java_pc from
1632   // last_Java_sp[-1].
1633   // On 32bit we then have to push additional args on the stack to accomplish
1634   // the actual requested call. On 64bit call_VM can only use register args,
1635   // so the only extra space is the return address that call_VM created.
1636   // This hopefully explains the calculations here.
1637 
1638 #ifdef _LP64
1639   // We've pushed one address, correct last_Java_sp
1640   lea(rax, Address(rsp, wordSize));
1641 #else
1642   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
1643 #endif // LP64
1644 
1645   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
1646 
1647 }
1648 
1649 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1650 void MacroAssembler::call_VM_leaf0(address entry_point) {
1651   MacroAssembler::call_VM_leaf_base(entry_point, 0);
1652 }
1653 
1654 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1655   call_VM_leaf_base(entry_point, number_of_arguments);
1656 }
1657 
1658 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1659   pass_arg0(this, arg_0);
1660   call_VM_leaf(entry_point, 1);
1661 }
1662 
1663 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1664 
1665   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1666   pass_arg1(this, arg_1);
1667   pass_arg0(this, arg_0);
1668   call_VM_leaf(entry_point, 2);
1669 }
1670 
1671 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1672   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1673   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1674   pass_arg2(this, arg_2);
1675   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1676   pass_arg1(this, arg_1);
1677   pass_arg0(this, arg_0);
1678   call_VM_leaf(entry_point, 3);
1679 }
1680 
1681 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1682   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1683   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1684   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1685   pass_arg3(this, arg_3);
1686   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1687   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1688   pass_arg2(this, arg_2);
1689   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1690   pass_arg1(this, arg_1);
1691   pass_arg0(this, arg_0);
1692   call_VM_leaf(entry_point, 4);
1693 }
1694 
1695 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1696   pass_arg0(this, arg_0);
1697   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1698 }
1699 
1700 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1701 
1702   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1703   pass_arg1(this, arg_1);
1704   pass_arg0(this, arg_0);
1705   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1706 }
1707 
1708 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1709   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1710   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1711   pass_arg2(this, arg_2);
1712   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1713   pass_arg1(this, arg_1);
1714   pass_arg0(this, arg_0);
1715   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1716 }
1717 
1718 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1719   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1720   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1721   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1722   pass_arg3(this, arg_3);
1723   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1724   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1725   pass_arg2(this, arg_2);
1726   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1727   pass_arg1(this, arg_1);
1728   pass_arg0(this, arg_0);
1729   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1730 }
1731 
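// Load the oop result deposited by the VM call from JavaThread::vm_result and
// clear the field.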
1732 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1733   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1734   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
1735   verify_oop_msg(oop_result, "broken oop in call_VM_base");
1736 }
1737 
1738 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1739   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1740   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
1741 }
1742 
1743 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
1744 }
1745 
1746 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
1747 }
1748 
1749 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm, Register rscratch) {
1750   assert(rscratch != noreg || always_reachable(src1), "missing");
1751 
1752   if (reachable(src1)) {
1753     cmpl(as_Address(src1), imm);
1754   } else {
1755     lea(rscratch, src1);
1756     cmpl(Address(rscratch, 0), imm);
1757   }
1758 }
1759 
1760 void MacroAssembler::cmp32(Register src1, AddressLiteral src2, Register rscratch) {
1761   assert(!src2.is_lval(), "use cmpptr");
1762   assert(rscratch != noreg || always_reachable(src2), "missing");
1763 
1764   if (reachable(src2)) {
1765     cmpl(src1, as_Address(src2));
1766   } else {
1767     lea(rscratch, src2);
1768     cmpl(src1, Address(rscratch, 0));
1769   }
1770 }
1771 
1772 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1773   Assembler::cmpl(src1, imm);
1774 }
1775 
1776 void MacroAssembler::cmp32(Register src1, Address src2) {
1777   Assembler::cmpl(src1, src2);
1778 }
1779 
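// Materialize the result of a double compare as an integer in dst:
// -1 if opr1 < opr2, 0 if equal, +1 if opr1 > opr2. An unordered result (NaN)
// collapses to -1 or +1 depending on unordered_is_less.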
1780 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1781   ucomisd(opr1, opr2);
1782 
1783   Label L;
1784   if (unordered_is_less) {
1785     movl(dst, -1);
1786     jcc(Assembler::parity, L);
1787     jcc(Assembler::below , L);
1788     movl(dst, 0);
1789     jcc(Assembler::equal , L);
1790     increment(dst);
1791   } else { // unordered is greater
1792     movl(dst, 1);
1793     jcc(Assembler::parity, L);
1794     jcc(Assembler::above , L);
1795     movl(dst, 0);
1796     jcc(Assembler::equal , L);
1797     decrementl(dst);
1798   }
1799   bind(L);
1800 }
1801 
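// Float variant of cmpsd2int above.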
1802 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1803   ucomiss(opr1, opr2);
1804 
1805   Label L;
1806   if (unordered_is_less) {
1807     movl(dst, -1);
1808     jcc(Assembler::parity, L);
1809     jcc(Assembler::below , L);
1810     movl(dst, 0);
1811     jcc(Assembler::equal , L);
1812     increment(dst);
1813   } else { // unordered is greater
1814     movl(dst, 1);
1815     jcc(Assembler::parity, L);
1816     jcc(Assembler::above , L);
1817     movl(dst, 0);
1818     jcc(Assembler::equal , L);
1819     decrementl(dst);
1820   }
1821   bind(L);
1822 }
1823 
1824 
1825 void MacroAssembler::cmp8(AddressLiteral src1, int imm, Register rscratch) {
1826   assert(rscratch != noreg || always_reachable(src1), "missing");
1827 
1828   if (reachable(src1)) {
1829     cmpb(as_Address(src1), imm);
1830   } else {
1831     lea(rscratch, src1);
1832     cmpb(Address(rscratch, 0), imm);
1833   }
1834 }
1835 
1836 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2, Register rscratch) {
1837 #ifdef _LP64
1838   assert(rscratch != noreg || always_reachable(src2), "missing");
1839 
1840   if (src2.is_lval()) {
1841     movptr(rscratch, src2);
1842     Assembler::cmpq(src1, rscratch);
1843   } else if (reachable(src2)) {
1844     cmpq(src1, as_Address(src2));
1845   } else {
1846     lea(rscratch, src2);
1847     Assembler::cmpq(src1, Address(rscratch, 0));
1848   }
1849 #else
1850   assert(rscratch == noreg, "not needed");
1851   if (src2.is_lval()) {
1852     cmp_literal32(src1, (int32_t)src2.target(), src2.rspec());
1853   } else {
1854     cmpl(src1, as_Address(src2));
1855   }
1856 #endif // _LP64
1857 }
1858 
1859 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2, Register rscratch) {
1860   assert(src2.is_lval(), "not a mem-mem compare");
1861 #ifdef _LP64
1862   // moves src2's literal address
1863   movptr(rscratch, src2);
1864   Assembler::cmpq(src1, rscratch);
1865 #else
1866   assert(rscratch == noreg, "not needed");
1867   cmp_literal32(src1, (int32_t)src2.target(), src2.rspec());
1868 #endif // _LP64
1869 }
1870 
1871 void MacroAssembler::cmpoop(Register src1, Register src2) {
1872   cmpptr(src1, src2);
1873 }
1874 
1875 void MacroAssembler::cmpoop(Register src1, Address src2) {
1876   cmpptr(src1, src2);
1877 }
1878 
1879 #ifdef _LP64
1880 void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) {
1881   movoop(rscratch, src2);
1882   cmpptr(src1, rscratch);
1883 }
1884 #endif
1885 
1886 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) {
1887   assert(rscratch != noreg || always_reachable(adr), "missing");
1888 
1889   if (reachable(adr)) {
1890     lock();
1891     cmpxchgptr(reg, as_Address(adr));
1892   } else {
1893     lea(rscratch, adr);
1894     lock();
1895     cmpxchgptr(reg, Address(rscratch, 0));
1896   }
1897 }
1898 
1899 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
1900   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
1901 }
1902 
1903 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1904   assert(rscratch != noreg || always_reachable(src), "missing");
1905 
1906   if (reachable(src)) {
1907     Assembler::comisd(dst, as_Address(src));
1908   } else {
1909     lea(rscratch, src);
1910     Assembler::comisd(dst, Address(rscratch, 0));
1911   }
1912 }
1913 
1914 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1915   assert(rscratch != noreg || always_reachable(src), "missing");
1916 
1917   if (reachable(src)) {
1918     Assembler::comiss(dst, as_Address(src));
1919   } else {
1920     lea(rscratch, src);
1921     Assembler::comiss(dst, Address(rscratch, 0));
1922   }
1923 }
1924 
1925 
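// Conditionally increment the counter at counter_addr; the flags are saved and
// restored around the update so the caller's condition codes are preserved.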
1926 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch) {
1927   assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1928 
1929   Condition negated_cond = negate_condition(cond);
1930   Label L;
1931   jcc(negated_cond, L);
1932   pushf(); // Preserve flags
1933   atomic_incl(counter_addr, rscratch);
1934   popf();
1935   bind(L);
1936 }
1937 
1938 int MacroAssembler::corrected_idivl(Register reg) {
1939   // Full implementation of Java idiv and irem; checks for
1940   // special case as described in JVM spec., p.243 & p.271.
1941   // The function returns the (pc) offset of the idivl
1942   // instruction - may be needed for implicit exceptions.
1943   //
1944   //         normal case                           special case
1945   //
1946   // input : rax: dividend                          min_int
1947   //         reg: divisor   (may not be rax/rdx)    -1
1948   //
1949   // output: rax: quotient  (= rax idiv reg)        min_int
1950   //         rdx: remainder (= rax irem reg)        0
1951   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
1952   const int min_int = 0x80000000;
1953   Label normal_case, special_case;
1954 
1955   // check for special case
1956   cmpl(rax, min_int);
1957   jcc(Assembler::notEqual, normal_case);
1958   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
1959   cmpl(reg, -1);
1960   jcc(Assembler::equal, special_case);
1961 
1962   // handle normal case
1963   bind(normal_case);
1964   cdql();
1965   int idivl_offset = offset();
1966   idivl(reg);
1967 
1968   // normal and special case exit
1969   bind(special_case);
1970 
1971   return idivl_offset;
1972 }
1973 
1974 
1975 
1976 void MacroAssembler::decrementl(Register reg, int value) {
1977   if (value == min_jint) {subl(reg, value) ; return; }
1978   if (value <  0) { incrementl(reg, -value); return; }
1979   if (value == 0) {                        ; return; }
1980   if (value == 1 && UseIncDec) { decl(reg) ; return; }
1981   /* else */      { subl(reg, value)       ; return; }
1982 }
1983 
1984 void MacroAssembler::decrementl(Address dst, int value) {
1985   if (value == min_jint) {subl(dst, value) ; return; }
1986   if (value <  0) { incrementl(dst, -value); return; }
1987   if (value == 0) {                        ; return; }
1988   if (value == 1 && UseIncDec) { decl(dst) ; return; }
1989   /* else */      { subl(dst, value)       ; return; }
1990 }
1991 
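// Signed division of reg by 2^shift_value: for a negative dividend, add
// (2^shift_value - 1) before the arithmetic shift so the quotient rounds
// toward zero, matching Java division semantics.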
1992 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
1993   assert(shift_value > 0, "illegal shift value");
1994   Label _is_positive;
1995   testl (reg, reg);
1996   jcc (Assembler::positive, _is_positive);
1997   int offset = (1 << shift_value) - 1 ;
1998 
1999   if (offset == 1) {
2000     incrementl(reg);
2001   } else {
2002     addl(reg, offset);
2003   }
2004 
2005   bind (_is_positive);
2006   sarl(reg, shift_value);
2007 }
2008 
2009 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2010   assert(rscratch != noreg || always_reachable(src), "missing");
2011 
2012   if (reachable(src)) {
2013     Assembler::divsd(dst, as_Address(src));
2014   } else {
2015     lea(rscratch, src);
2016     Assembler::divsd(dst, Address(rscratch, 0));
2017   }
2018 }
2019 
2020 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2021   assert(rscratch != noreg || always_reachable(src), "missing");
2022 
2023   if (reachable(src)) {
2024     Assembler::divss(dst, as_Address(src));
2025   } else {
2026     lea(rscratch, src);
2027     Assembler::divss(dst, Address(rscratch, 0));
2028   }
2029 }
2030 
2031 void MacroAssembler::enter() {
2032   push(rbp);
2033   mov(rbp, rsp);
2034 }
2035 
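// Emit an 8-byte NOP (0F 1F 84 00 followed by a 32-bit zero displacement) with
// a post_call_nop relocation so the runtime can recognize the instruction
// following a call site when continuations are enabled.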
2036 void MacroAssembler::post_call_nop() {
2037   if (!Continuations::enabled()) {
2038     return;
2039   }
2040   InstructionMark im(this);
2041   relocate(post_call_nop_Relocation::spec());
2042   InlineSkippedInstructionsCounter skipCounter(this);
2043   emit_int8((uint8_t)0x0f);
2044   emit_int8((uint8_t)0x1f);
2045   emit_int8((uint8_t)0x84);
2046   emit_int8((uint8_t)0x00);
2047   emit_int32(0x00);
2048 }
2049 
2050 // A 5 byte nop that is safe for patching (see patch_verified_entry)
2051 void MacroAssembler::fat_nop() {
2052   if (UseAddressNop) {
2053     addr_nop_5();
2054   } else {
2055     emit_int8((uint8_t)0x26); // es:
2056     emit_int8((uint8_t)0x2e); // cs:
2057     emit_int8((uint8_t)0x64); // fs:
2058     emit_int8((uint8_t)0x65); // gs:
2059     emit_int8((uint8_t)0x90);
2060   }
2061 }
2062 
2063 #ifndef _LP64
2064 void MacroAssembler::fcmp(Register tmp) {
2065   fcmp(tmp, 1, true, true);
2066 }
2067 
2068 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2069   assert(!pop_right || pop_left, "usage error");
2070   if (VM_Version::supports_cmov()) {
2071     assert(tmp == noreg, "unneeded temp");
2072     if (pop_left) {
2073       fucomip(index);
2074     } else {
2075       fucomi(index);
2076     }
2077     if (pop_right) {
2078       fpop();
2079     }
2080   } else {
2081     assert(tmp != noreg, "need temp");
2082     if (pop_left) {
2083       if (pop_right) {
2084         fcompp();
2085       } else {
2086         fcomp(index);
2087       }
2088     } else {
2089       fcom(index);
2090     }
2091     // convert FPU condition into eflags condition via rax,
2092     save_rax(tmp);
2093     fwait(); fnstsw_ax();
2094     sahf();
2095     restore_rax(tmp);
2096   }
2097   // condition codes set as follows:
2098   //
2099   // CF (corresponds to C0) if x < y
2100   // PF (corresponds to C2) if unordered
2101   // ZF (corresponds to C3) if x = y
2102 }
2103 
2104 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2105   fcmp2int(dst, unordered_is_less, 1, true, true);
2106 }
2107 
2108 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2109   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2110   Label L;
2111   if (unordered_is_less) {
2112     movl(dst, -1);
2113     jcc(Assembler::parity, L);
2114     jcc(Assembler::below , L);
2115     movl(dst, 0);
2116     jcc(Assembler::equal , L);
2117     increment(dst);
2118   } else { // unordered is greater
2119     movl(dst, 1);
2120     jcc(Assembler::parity, L);
2121     jcc(Assembler::above , L);
2122     movl(dst, 0);
2123     jcc(Assembler::equal , L);
2124     decrementl(dst);
2125   }
2126   bind(L);
2127 }
2128 
2129 void MacroAssembler::fld_d(AddressLiteral src) {
2130   fld_d(as_Address(src));
2131 }
2132 
2133 void MacroAssembler::fld_s(AddressLiteral src) {
2134   fld_s(as_Address(src));
2135 }
2136 
2137 void MacroAssembler::fldcw(AddressLiteral src) {
2138   fldcw(as_Address(src));
2139 }
2140 
2141 void MacroAssembler::fpop() {
2142   ffree();
2143   fincstp();
2144 }
2145 
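// Compute the floating-point remainder of ST0 / ST1 with fprem, repeating while
// the reduction is incomplete (C2 set), then drop the divisor so the result is
// left in ST0.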
2146 void MacroAssembler::fremr(Register tmp) {
2147   save_rax(tmp);
2148   { Label L;
2149     bind(L);
2150     fprem();
2151     fwait(); fnstsw_ax();
2152     sahf();
2153     jcc(Assembler::parity, L);
2154   }
2155   restore_rax(tmp);
2156   // Result is in ST0.
2157   // Note: fxch & fpop to get rid of ST1
2158   // (otherwise FPU stack could overflow eventually)
2159   fxch(1);
2160   fpop();
2161 }
2162 
2163 void MacroAssembler::empty_FPU_stack() {
2164   if (VM_Version::supports_mmx()) {
2165     emms();
2166   } else {
2167     for (int i = 8; i-- > 0; ) ffree(i);
2168   }
2169 }
2170 #endif // !LP64
2171 
2172 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2173   assert(rscratch != noreg || always_reachable(src), "missing");
2174   if (reachable(src)) {
2175     Assembler::mulpd(dst, as_Address(src));
2176   } else {
2177     lea(rscratch, src);
2178     Assembler::mulpd(dst, Address(rscratch, 0));
2179   }
2180 }
2181 
2182 void MacroAssembler::load_float(Address src) {
2183 #ifdef _LP64
2184   movflt(xmm0, src);
2185 #else
2186   if (UseSSE >= 1) {
2187     movflt(xmm0, src);
2188   } else {
2189     fld_s(src);
2190   }
2191 #endif // LP64
2192 }
2193 
2194 void MacroAssembler::store_float(Address dst) {
2195 #ifdef _LP64
2196   movflt(dst, xmm0);
2197 #else
2198   if (UseSSE >= 1) {
2199     movflt(dst, xmm0);
2200   } else {
2201     fstp_s(dst);
2202   }
2203 #endif // LP64
2204 }
2205 
2206 void MacroAssembler::load_double(Address src) {
2207 #ifdef _LP64
2208   movdbl(xmm0, src);
2209 #else
2210   if (UseSSE >= 2) {
2211     movdbl(xmm0, src);
2212   } else {
2213     fld_d(src);
2214   }
2215 #endif // LP64
2216 }
2217 
2218 void MacroAssembler::store_double(Address dst) {
2219 #ifdef _LP64
2220   movdbl(dst, xmm0);
2221 #else
2222   if (UseSSE >= 2) {
2223     movdbl(dst, xmm0);
2224   } else {
2225     fstp_d(dst);
2226   }
2227 #endif // LP64
2228 }
2229 
2230 // dst = c = a * b + c
2231 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2232   Assembler::vfmadd231sd(c, a, b);
2233   if (dst != c) {
2234     movdbl(dst, c);
2235   }
2236 }
2237 
2238 // dst = c = a * b + c
2239 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2240   Assembler::vfmadd231ss(c, a, b);
2241   if (dst != c) {
2242     movflt(dst, c);
2243   }
2244 }
2245 
2246 // dst = c = a * b + c
2247 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2248   Assembler::vfmadd231pd(c, a, b, vector_len);
2249   if (dst != c) {
2250     vmovdqu(dst, c);
2251   }
2252 }
2253 
2254 // dst = c = a * b + c
2255 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2256   Assembler::vfmadd231ps(c, a, b, vector_len);
2257   if (dst != c) {
2258     vmovdqu(dst, c);
2259   }
2260 }
2261 
2262 // dst = c = a * b + c
2263 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2264   Assembler::vfmadd231pd(c, a, b, vector_len);
2265   if (dst != c) {
2266     vmovdqu(dst, c);
2267   }
2268 }
2269 
2270 // dst = c = a * b + c
2271 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2272   Assembler::vfmadd231ps(c, a, b, vector_len);
2273   if (dst != c) {
2274     vmovdqu(dst, c);
2275   }
2276 }
2277 
2278 void MacroAssembler::incrementl(AddressLiteral dst, Register rscratch) {
2279   assert(rscratch != noreg || always_reachable(dst), "missing");
2280 
2281   if (reachable(dst)) {
2282     incrementl(as_Address(dst));
2283   } else {
2284     lea(rscratch, dst);
2285     incrementl(Address(rscratch, 0));
2286   }
2287 }
2288 
2289 void MacroAssembler::incrementl(ArrayAddress dst, Register rscratch) {
2290   incrementl(as_Address(dst, rscratch));
2291 }
2292 
2293 void MacroAssembler::incrementl(Register reg, int value) {
2294   if (value == min_jint) {addl(reg, value) ; return; }
2295   if (value <  0) { decrementl(reg, -value); return; }
2296   if (value == 0) {                        ; return; }
2297   if (value == 1 && UseIncDec) { incl(reg) ; return; }
2298   /* else */      { addl(reg, value)       ; return; }
2299 }
2300 
2301 void MacroAssembler::incrementl(Address dst, int value) {
2302   if (value == min_jint) {addl(dst, value) ; return; }
2303   if (value <  0) { decrementl(dst, -value); return; }
2304   if (value == 0) {                        ; return; }
2305   if (value == 1 && UseIncDec) { incl(dst) ; return; }
2306   /* else */      { addl(dst, value)       ; return; }
2307 }
2308 
2309 void MacroAssembler::jump(AddressLiteral dst, Register rscratch) {
2310   assert(rscratch != noreg || always_reachable(dst), "missing");
2311 
2312   if (reachable(dst)) {
2313     jmp_literal(dst.target(), dst.rspec());
2314   } else {
2315     lea(rscratch, dst);
2316     jmp(rscratch);
2317   }
2318 }
2319 
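// Conditional jump to an AddressLiteral. When the target is reachable this
// emits a short or near jcc directly; otherwise the condition is reversed and
// an indirect jump through rscratch is used so the destination can always be
// reached.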
2320 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst, Register rscratch) {
2321   assert(rscratch != noreg || always_reachable(dst), "missing");
2322 
2323   if (reachable(dst)) {
2324     InstructionMark im(this);
2325     relocate(dst.reloc());
2326     const int short_size = 2;
2327     const int long_size = 6;
2328     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
2329     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
2330       // 0111 tttn #8-bit disp
2331       emit_int8(0x70 | cc);
2332       emit_int8((offs - short_size) & 0xFF);
2333     } else {
2334       // 0000 1111 1000 tttn #32-bit disp
2335       emit_int8(0x0F);
2336       emit_int8((unsigned char)(0x80 | cc));
2337       emit_int32(offs - long_size);
2338     }
2339   } else {
2340 #ifdef ASSERT
2341     warning("reversing conditional branch");
2342 #endif /* ASSERT */
2343     Label skip;
2344     jccb(reverse[cc], skip);
2345     lea(rscratch, dst);
2346     Assembler::jmp(rscratch);
2347     bind(skip);
2348   }
2349 }
2350 
2351 void MacroAssembler::ldmxcsr(AddressLiteral src, Register rscratch) {
2352   assert(rscratch != noreg || always_reachable(src), "missing");
2353 
2354   if (reachable(src)) {
2355     Assembler::ldmxcsr(as_Address(src));
2356   } else {
2357     lea(rscratch, src);
2358     Assembler::ldmxcsr(Address(rscratch, 0));
2359   }
2360 }
2361 
2362 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2363   int off;
2364   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2365     off = offset();
2366     movsbl(dst, src); // movsxb
2367   } else {
2368     off = load_unsigned_byte(dst, src);
2369     shll(dst, 24);
2370     sarl(dst, 24);
2371   }
2372   return off;
2373 }
2374 
2375 // Note: load_signed_short used to be called load_signed_word.
2376 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
2377 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
2378 // The term "word" in HotSpot means a 32- or 64-bit machine word.
2379 int MacroAssembler::load_signed_short(Register dst, Address src) {
2380   int off;
2381   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2382     // This is dubious to me since it seems safe to do a signed 16 => 64 bit
2383     // version but this is what 64bit has always done. This seems to imply
2384     // that users are only using 32bits worth.
2385     off = offset();
2386     movswl(dst, src); // movsxw
2387   } else {
2388     off = load_unsigned_short(dst, src);
2389     shll(dst, 16);
2390     sarl(dst, 16);
2391   }
2392   return off;
2393 }
2394 
2395 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2396   // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
2397   // and "3.9 Partial Register Penalties", p. 22.
2398   int off;
2399   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
2400     off = offset();
2401     movzbl(dst, src); // movzxb
2402   } else {
2403     xorl(dst, dst);
2404     off = offset();
2405     movb(dst, src);
2406   }
2407   return off;
2408 }
2409 
2410 // Note: load_unsigned_short used to be called load_unsigned_word.
2411 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2412   // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
2413   // and "3.9 Partial Register Penalties", p. 22.
2414   int off;
2415   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
2416     off = offset();
2417     movzwl(dst, src); // movzxw
2418   } else {
2419     xorl(dst, dst);
2420     off = offset();
2421     movw(dst, src);
2422   }
2423   return off;
2424 }
2425 
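// Load a value of size_in_bytes from src into dst, sign- or zero-extending
// sub-word sizes as requested; on 32-bit a 64-bit value is split across
// dst (low word) and dst2 (high word).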
2426 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
2427   switch (size_in_bytes) {
2428 #ifndef _LP64
2429   case  8:
2430     assert(dst2 != noreg, "second dest register required");
2431     movl(dst,  src);
2432     movl(dst2, src.plus_disp(BytesPerInt));
2433     break;
2434 #else
2435   case  8:  movq(dst, src); break;
2436 #endif
2437   case  4:  movl(dst, src); break;
2438   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2439   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2440   default:  ShouldNotReachHere();
2441   }
2442 }
2443 
2444 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2445   switch (size_in_bytes) {
2446 #ifndef _LP64
2447   case  8:
2448     assert(src2 != noreg, "second source register required");
2449     movl(dst,                        src);
2450     movl(dst.plus_disp(BytesPerInt), src2);
2451     break;
2452 #else
2453   case  8:  movq(dst, src); break;
2454 #endif
2455   case  4:  movl(dst, src); break;
2456   case  2:  movw(dst, src); break;
2457   case  1:  movb(dst, src); break;
2458   default:  ShouldNotReachHere();
2459   }
2460 }
2461 
2462 void MacroAssembler::mov32(AddressLiteral dst, Register src, Register rscratch) {
2463   assert(rscratch != noreg || always_reachable(dst), "missing");
2464 
2465   if (reachable(dst)) {
2466     movl(as_Address(dst), src);
2467   } else {
2468     lea(rscratch, dst);
2469     movl(Address(rscratch, 0), src);
2470   }
2471 }
2472 
2473 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2474   if (reachable(src)) {
2475     movl(dst, as_Address(src));
2476   } else {
2477     lea(dst, src);
2478     movl(dst, Address(dst, 0));
2479   }
2480 }
2481 
2482 // C++ bool manipulation
2483 
2484 void MacroAssembler::movbool(Register dst, Address src) {
2485   if(sizeof(bool) == 1)
2486     movb(dst, src);
2487   else if(sizeof(bool) == 2)
2488     movw(dst, src);
2489   else if(sizeof(bool) == 4)
2490     movl(dst, src);
2491   else
2492     // unsupported
2493     ShouldNotReachHere();
2494 }
2495 
2496 void MacroAssembler::movbool(Address dst, bool boolconst) {
2497   if(sizeof(bool) == 1)
2498     movb(dst, (int) boolconst);
2499   else if(sizeof(bool) == 2)
2500     movw(dst, (int) boolconst);
2501   else if(sizeof(bool) == 4)
2502     movl(dst, (int) boolconst);
2503   else
2504     // unsupported
2505     ShouldNotReachHere();
2506 }
2507 
2508 void MacroAssembler::movbool(Address dst, Register src) {
2509   if(sizeof(bool) == 1)
2510     movb(dst, src);
2511   else if(sizeof(bool) == 2)
2512     movw(dst, src);
2513   else if(sizeof(bool) == 4)
2514     movl(dst, src);
2515   else
2516     // unsupported
2517     ShouldNotReachHere();
2518 }
2519 
2520 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src, Register rscratch) {
2521   assert(rscratch != noreg || always_reachable(src), "missing");
2522 
2523   if (reachable(src)) {
2524     movdl(dst, as_Address(src));
2525   } else {
2526     lea(rscratch, src);
2527     movdl(dst, Address(rscratch, 0));
2528   }
2529 }
2530 
2531 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src, Register rscratch) {
2532   assert(rscratch != noreg || always_reachable(src), "missing");
2533 
2534   if (reachable(src)) {
2535     movq(dst, as_Address(src));
2536   } else {
2537     lea(rscratch, src);
2538     movq(dst, Address(rscratch, 0));
2539   }
2540 }
2541 
2542 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src, Register rscratch) {
2543   assert(rscratch != noreg || always_reachable(src), "missing");
2544 
2545   if (reachable(src)) {
2546     if (UseXmmLoadAndClearUpper) {
2547       movsd (dst, as_Address(src));
2548     } else {
2549       movlpd(dst, as_Address(src));
2550     }
2551   } else {
2552     lea(rscratch, src);
2553     if (UseXmmLoadAndClearUpper) {
2554       movsd (dst, Address(rscratch, 0));
2555     } else {
2556       movlpd(dst, Address(rscratch, 0));
2557     }
2558   }
2559 }
2560 
2561 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src, Register rscratch) {
2562   assert(rscratch != noreg || always_reachable(src), "missing");
2563 
2564   if (reachable(src)) {
2565     movss(dst, as_Address(src));
2566   } else {
2567     lea(rscratch, src);
2568     movss(dst, Address(rscratch, 0));
2569   }
2570 }
2571 
2572 void MacroAssembler::movptr(Register dst, Register src) {
2573   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2574 }
2575 
2576 void MacroAssembler::movptr(Register dst, Address src) {
2577   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2578 }
2579 
2580 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
2581 void MacroAssembler::movptr(Register dst, intptr_t src) {
2582 #ifdef _LP64
2583   if (is_simm32(src)) {
2584     movq(dst, checked_cast<int32_t>(src));
2585   } else {
2586     mov64(dst, src);
2587   }
2588 #else
2589   movl(dst, src);
2590 #endif
2591 }
2592 
2593 void MacroAssembler::movptr(Address dst, Register src) {
2594   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2595 }
2596 
2597 void MacroAssembler::movptr(Address dst, int32_t src) {
2598   LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src));
2599 }
2600 
2601 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2602   assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2603   Assembler::movdqu(dst, src);
2604 }
2605 
2606 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2607   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2608   Assembler::movdqu(dst, src);
2609 }
2610 
2611 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2612   assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2613   Assembler::movdqu(dst, src);
2614 }
2615 
2616 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2617   assert(rscratch != noreg || always_reachable(src), "missing");
2618 
2619   if (reachable(src)) {
2620     movdqu(dst, as_Address(src));
2621   } else {
2622     lea(rscratch, src);
2623     movdqu(dst, Address(rscratch, 0));
2624   }
2625 }
2626 
2627 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2628   assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2629   Assembler::vmovdqu(dst, src);
2630 }
2631 
2632 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2633   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2634   Assembler::vmovdqu(dst, src);
2635 }
2636 
2637 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2638   assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2639   Assembler::vmovdqu(dst, src);
2640 }
2641 
2642 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2643   assert(rscratch != noreg || always_reachable(src), "missing");
2644 
2645   if (reachable(src)) {
2646     vmovdqu(dst, as_Address(src));
2647   }
2648   else {
2649     lea(rscratch, src);
2650     vmovdqu(dst, Address(rscratch, 0));
2651   }
2652 }
2653 
2654 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2655   assert(rscratch != noreg || always_reachable(src), "missing");
2656 
2657   if (vector_len == AVX_512bit) {
2658     evmovdquq(dst, src, AVX_512bit, rscratch);
2659   } else if (vector_len == AVX_256bit) {
2660     vmovdqu(dst, src, rscratch);
2661   } else {
2662     movdqu(dst, src, rscratch);
2663   }
2664 }
2665 
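// kmov: move an opmask register using the 64-bit form when AVX512BW is
// available, otherwise fall back to the 16-bit form (which requires EVEX).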
2666 void MacroAssembler::kmov(KRegister dst, Address src) {
2667   if (VM_Version::supports_avx512bw()) {
2668     kmovql(dst, src);
2669   } else {
2670     assert(VM_Version::supports_evex(), "");
2671     kmovwl(dst, src);
2672   }
2673 }
2674 
2675 void MacroAssembler::kmov(Address dst, KRegister src) {
2676   if (VM_Version::supports_avx512bw()) {
2677     kmovql(dst, src);
2678   } else {
2679     assert(VM_Version::supports_evex(), "");
2680     kmovwl(dst, src);
2681   }
2682 }
2683 
2684 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2685   if (VM_Version::supports_avx512bw()) {
2686     kmovql(dst, src);
2687   } else {
2688     assert(VM_Version::supports_evex(), "");
2689     kmovwl(dst, src);
2690   }
2691 }
2692 
2693 void MacroAssembler::kmov(Register dst, KRegister src) {
2694   if (VM_Version::supports_avx512bw()) {
2695     kmovql(dst, src);
2696   } else {
2697     assert(VM_Version::supports_evex(), "");
2698     kmovwl(dst, src);
2699   }
2700 }
2701 
2702 void MacroAssembler::kmov(KRegister dst, Register src) {
2703   if (VM_Version::supports_avx512bw()) {
2704     kmovql(dst, src);
2705   } else {
2706     assert(VM_Version::supports_evex(), "");
2707     kmovwl(dst, src);
2708   }
2709 }
2710 
2711 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register rscratch) {
2712   assert(rscratch != noreg || always_reachable(src), "missing");
2713 
2714   if (reachable(src)) {
2715     kmovql(dst, as_Address(src));
2716   } else {
2717     lea(rscratch, src);
2718     kmovql(dst, Address(rscratch, 0));
2719   }
2720 }
2721 
2722 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register rscratch) {
2723   assert(rscratch != noreg || always_reachable(src), "missing");
2724 
2725   if (reachable(src)) {
2726     kmovwl(dst, as_Address(src));
2727   } else {
2728     lea(rscratch, src);
2729     kmovwl(dst, Address(rscratch, 0));
2730   }
2731 }
2732 
2733 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2734                                int vector_len, Register rscratch) {
2735   assert(rscratch != noreg || always_reachable(src), "missing");
2736 
2737   if (reachable(src)) {
2738     Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2739   } else {
2740     lea(rscratch, src);
2741     Assembler::evmovdqub(dst, mask, Address(rscratch, 0), merge, vector_len);
2742   }
2743 }
2744 
2745 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2746                                int vector_len, Register rscratch) {
2747   assert(rscratch != noreg || always_reachable(src), "missing");
2748 
2749   if (reachable(src)) {
2750     Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2751   } else {
2752     lea(rscratch, src);
2753     Assembler::evmovdquw(dst, mask, Address(rscratch, 0), merge, vector_len);
2754   }
2755 }
2756 
2757 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2758   assert(rscratch != noreg || always_reachable(src), "missing");
2759 
2760   if (reachable(src)) {
2761     Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2762   } else {
2763     lea(rscratch, src);
2764     Assembler::evmovdqul(dst, mask, Address(rscratch, 0), merge, vector_len);
2765   }
2766 }
2767 
2768 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2769   assert(rscratch != noreg || always_reachable(src), "missing");
2770 
2771   if (reachable(src)) {
2772     Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2773   } else {
2774     lea(rscratch, src);
2775     Assembler::evmovdquq(dst, mask, Address(rscratch, 0), merge, vector_len);
2776   }
2777 }
2778 
2779 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2780   assert(rscratch != noreg || always_reachable(src), "missing");
2781 
2782   if (reachable(src)) {
2783     Assembler::evmovdquq(dst, as_Address(src), vector_len);
2784   } else {
2785     lea(rscratch, src);
2786     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2787   }
2788 }
2789 
2790 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
2791   assert(rscratch != noreg || always_reachable(src), "missing");
2792 
2793   if (reachable(src)) {
2794     Assembler::movdqa(dst, as_Address(src));
2795   } else {
2796     lea(rscratch, src);
2797     Assembler::movdqa(dst, Address(rscratch, 0));
2798   }
2799 }
2800 
2801 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2802   assert(rscratch != noreg || always_reachable(src), "missing");
2803 
2804   if (reachable(src)) {
2805     Assembler::movsd(dst, as_Address(src));
2806   } else {
2807     lea(rscratch, src);
2808     Assembler::movsd(dst, Address(rscratch, 0));
2809   }
2810 }
2811 
2812 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2813   assert(rscratch != noreg || always_reachable(src), "missing");
2814 
2815   if (reachable(src)) {
2816     Assembler::movss(dst, as_Address(src));
2817   } else {
2818     lea(rscratch, src);
2819     Assembler::movss(dst, Address(rscratch, 0));
2820   }
2821 }
2822 
2823 void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) {
2824   assert(rscratch != noreg || always_reachable(src), "missing");
2825 
2826   if (reachable(src)) {
2827     Assembler::movddup(dst, as_Address(src));
2828   } else {
2829     lea(rscratch, src);
2830     Assembler::movddup(dst, Address(rscratch, 0));
2831   }
2832 }
2833 
2834 void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2835   assert(rscratch != noreg || always_reachable(src), "missing");
2836 
2837   if (reachable(src)) {
2838     Assembler::vmovddup(dst, as_Address(src), vector_len);
2839   } else {
2840     lea(rscratch, src);
2841     Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
2842   }
2843 }
2844 
2845 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2846   assert(rscratch != noreg || always_reachable(src), "missing");
2847 
2848   if (reachable(src)) {
2849     Assembler::mulsd(dst, as_Address(src));
2850   } else {
2851     lea(rscratch, src);
2852     Assembler::mulsd(dst, Address(rscratch, 0));
2853   }
2854 }
2855 
2856 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2857   assert(rscratch != noreg || always_reachable(src), "missing");
2858 
2859   if (reachable(src)) {
2860     Assembler::mulss(dst, as_Address(src));
2861   } else {
2862     lea(rscratch, src);
2863     Assembler::mulss(dst, Address(rscratch, 0));
2864   }
2865 }
2866 
2867 void MacroAssembler::null_check(Register reg, int offset) {
2868   if (needs_explicit_null_check(offset)) {
2869     // provoke OS null exception if reg is null by
2870     // accessing M[reg] w/o changing any (non-CC) registers
2871     // NOTE: cmpl is plenty here to provoke a segv
2872     cmpptr(rax, Address(reg, 0));
2873     // Note: should probably use testl(rax, Address(reg, 0));
2874     //       may be shorter code (however, this version of
2875     //       testl needs to be implemented first)
2876   } else {
2877     // nothing to do, (later) access of M[reg + offset]
2878     // will provoke OS null exception if reg is null
2879   }
2880 }
2881 
2882 void MacroAssembler::os_breakpoint() {
2883   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
2884   // (e.g., MSVC can't call ps() otherwise)
2885   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2886 }
2887 
2888 void MacroAssembler::unimplemented(const char* what) {
2889   const char* buf = nullptr;
2890   {
2891     ResourceMark rm;
2892     stringStream ss;
2893     ss.print("unimplemented: %s", what);
2894     buf = code_string(ss.as_string());
2895   }
2896   stop(buf);
2897 }
2898 
2899 #ifdef _LP64
2900 #define XSTATE_BV 0x200
2901 #endif
2902 
2903 void MacroAssembler::pop_CPU_state() {
2904   pop_FPU_state();
2905   pop_IU_state();
2906 }
2907 
2908 void MacroAssembler::pop_FPU_state() {
2909 #ifndef _LP64
2910   frstor(Address(rsp, 0));
2911 #else
2912   fxrstor(Address(rsp, 0));
2913 #endif
2914   addptr(rsp, FPUStateSizeInWords * wordSize);
2915 }
2916 
2917 void MacroAssembler::pop_IU_state() {
2918   popa();
2919   LP64_ONLY(addq(rsp, 8));
2920   popf();
2921 }
2922 
2923 // Save Integer and Float state
2924 // Warning: Stack must be 16 byte aligned (64bit)
2925 void MacroAssembler::push_CPU_state() {
2926   push_IU_state();
2927   push_FPU_state();
2928 }
2929 
2930 void MacroAssembler::push_FPU_state() {
2931   subptr(rsp, FPUStateSizeInWords * wordSize);
2932 #ifndef _LP64
2933   fnsave(Address(rsp, 0));
2934   fwait();
2935 #else
2936   fxsave(Address(rsp, 0));
2937 #endif // LP64
2938 }
2939 
2940 void MacroAssembler::push_IU_state() {
2941   // Push flags first because pusha kills them
2942   pushf();
2943   // Make sure rsp stays 16-byte aligned
2944   LP64_ONLY(subq(rsp, 8));
2945   pusha();
2946 }
2947 
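// push_cont_fastpath records the current sp in JavaThread::cont_fastpath when
// it is above the stored value; pop_cont_fastpath clears the field once the
// current sp has reached or passed it. The field serves as a stack watermark
// for the continuation freeze fast path.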
2948 void MacroAssembler::push_cont_fastpath() {
2949   if (!Continuations::enabled()) return;
2950 
2951 #ifndef _LP64
2952   Register rthread = rax;
2953   Register rrealsp = rbx;
2954   push(rthread);
2955   push(rrealsp);
2956 
2957   get_thread(rthread);
2958 
2959   // The code below wants the original RSP.
2960   // Move it back after the pushes above.
2961   movptr(rrealsp, rsp);
2962   addptr(rrealsp, 2*wordSize);
2963 #else
2964   Register rthread = r15_thread;
2965   Register rrealsp = rsp;
2966 #endif
2967 
2968   Label done;
2969   cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset()));
2970   jccb(Assembler::belowEqual, done);
2971   movptr(Address(rthread, JavaThread::cont_fastpath_offset()), rrealsp);
2972   bind(done);
2973 
2974 #ifndef _LP64
2975   pop(rrealsp);
2976   pop(rthread);
2977 #endif
2978 }
2979 
2980 void MacroAssembler::pop_cont_fastpath() {
2981   if (!Continuations::enabled()) return;
2982 
2983 #ifndef _LP64
2984   Register rthread = rax;
2985   Register rrealsp = rbx;
2986   push(rthread);
2987   push(rrealsp);
2988 
2989   get_thread(rthread);
2990 
2991   // The code below wants the original RSP.
2992   // Move it back after the pushes above.
2993   movptr(rrealsp, rsp);
2994   addptr(rrealsp, 2*wordSize);
2995 #else
2996   Register rthread = r15_thread;
2997   Register rrealsp = rsp;
2998 #endif
2999 
3000   Label done;
3001   cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset()));
3002   jccb(Assembler::below, done);
3003   movptr(Address(rthread, JavaThread::cont_fastpath_offset()), 0);
3004   bind(done);
3005 
3006 #ifndef _LP64
3007   pop(rrealsp);
3008   pop(rthread);
3009 #endif
3010 }
3011 
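// Adjust JavaThread::held_monitor_count, the per-thread tally of monitors
// currently held.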
3012 void MacroAssembler::inc_held_monitor_count() {
3013 #ifndef _LP64
3014   Register thread = rax;
3015   push(thread);
3016   get_thread(thread);
3017   incrementl(Address(thread, JavaThread::held_monitor_count_offset()));
3018   pop(thread);
3019 #else // LP64
3020   incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
3021 #endif
3022 }
3023 
3024 void MacroAssembler::dec_held_monitor_count() {
3025 #ifndef _LP64
3026   Register thread = rax;
3027   push(thread);
3028   get_thread(thread);
3029   decrementl(Address(thread, JavaThread::held_monitor_count_offset()));
3030   pop(thread);
3031 #else // LP64
3032   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
3033 #endif
3034 }
3035 
3036 #ifdef ASSERT
3037 void MacroAssembler::stop_if_in_cont(Register cont, const char* name) {
3038 #ifdef _LP64
3039   Label no_cont;
3040   movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset()));
3041   testl(cont, cont);
3042   jcc(Assembler::zero, no_cont);
3043   stop(name);
3044   bind(no_cont);
3045 #else
3046   Unimplemented();
3047 #endif
3048 }
3049 #endif
3050 
3051 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register
3052   if (!java_thread->is_valid()) {
3053     java_thread = rdi;
3054     get_thread(java_thread);
3055   }
3056   // we must set sp to zero to clear frame
3057   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3058   // must clear fp, so that compiled frames are not confused; it is
3059   // possible that we need it only for debugging
3060   if (clear_fp) {
3061     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3062   }
3063   // Always clear the pc because it could have been set by make_walkable()
3064   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3065   vzeroupper();
3066 }
3067 
3068 void MacroAssembler::restore_rax(Register tmp) {
3069   if (tmp == noreg) pop(rax);
3070   else if (tmp != rax) mov(rax, tmp);
3071 }
3072 
3073 void MacroAssembler::round_to(Register reg, int modulus) {
3074   addptr(reg, modulus - 1);
3075   andptr(reg, -modulus);
3076 }
3077 
3078 void MacroAssembler::save_rax(Register tmp) {
3079   if (tmp == noreg) push(rax);
3080   else if (tmp != rax) mov(tmp, rax);
3081 }
3082 
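// Safepoint / handshake poll. At a return (at_return) the stack pointer (rsp,
// or rbp for non-nmethod frames) is compared against the per-thread polling
// word, which doubles as the stack watermark; otherwise the poll bit in the
// polling word is tested directly.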
3083 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
3084   if (at_return) {
3085     // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
3086     // we may safely use rsp instead to perform the stack watermark check.
3087     cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset()));
3088     jcc(Assembler::above, slow_path);
3089     return;
3090   }
3091   testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
3092   jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
3093 }
3094 
3095 // Calls to C land
3096 //
3097 // When entering C land, the rbp and rsp of the last Java frame have to be recorded
3098 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3099 // has to be reset to 0. This is required to allow proper stack traversal.
3100 void MacroAssembler::set_last_Java_frame(Register java_thread,
3101                                          Register last_java_sp,
3102                                          Register last_java_fp,
3103                                          address  last_java_pc,
3104                                          Register rscratch) {
3105   vzeroupper();
3106   // determine java_thread register
3107   if (!java_thread->is_valid()) {
3108     java_thread = rdi;
3109     get_thread(java_thread);
3110   }
3111   // determine last_java_sp register
3112   if (!last_java_sp->is_valid()) {
3113     last_java_sp = rsp;
3114   }
3115   // last_java_fp is optional
3116   if (last_java_fp->is_valid()) {
3117     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3118   }
3119   // last_java_pc is optional
3120   if (last_java_pc != nullptr) {
3121     Address java_pc(java_thread,
3122                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
3123     lea(java_pc, InternalAddress(last_java_pc), rscratch);
3124   }
3125   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3126 }
3127 
3128 void MacroAssembler::shlptr(Register dst, int imm8) {
3129   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3130 }
3131 
3132 void MacroAssembler::shrptr(Register dst, int imm8) {
3133   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3134 }
3135 
3136 void MacroAssembler::sign_extend_byte(Register reg) {
3137   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3138     movsbl(reg, reg); // movsxb
3139   } else {
3140     shll(reg, 24);
3141     sarl(reg, 24);
3142   }
3143 }
3144 
3145 void MacroAssembler::sign_extend_short(Register reg) {
3146   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3147     movswl(reg, reg); // movsxw
3148   } else {
3149     shll(reg, 16);
3150     sarl(reg, 16);
3151   }
3152 }
3153 
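// For test with an immediate, use the shorter byte-sized encoding when the
// value is non-negative and fits in eight bits; this saves code size and
// produces the same zero-flag result.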
3154 void MacroAssembler::testl(Address dst, int32_t imm32) {
3155   if (imm32 >= 0 && is8bit(imm32)) {
3156     testb(dst, imm32);
3157   } else {
3158     Assembler::testl(dst, imm32);
3159   }
3160 }
3161 
3162 void MacroAssembler::testl(Register dst, int32_t imm32) {
3163   if (imm32 >= 0 && is8bit(imm32) && dst->has_byte_register()) {
3164     testb(dst, imm32);
3165   } else {
3166     Assembler::testl(dst, imm32);
3167   }
3168 }
3169 
3170 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3171   assert(always_reachable(src), "Address should be reachable");
3172   testl(dst, as_Address(src));
3173 }
3174 
3175 #ifdef _LP64
3176 
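// For a non-negative 32-bit immediate the sign extension to 64 bits is all zeroes, so the
// upper half of the AND result is zero and testq produces the same flags as the 32-bit
// testl; the narrower, REX.W-free encoding is used instead.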
3177 void MacroAssembler::testq(Address dst, int32_t imm32) {
3178   if (imm32 >= 0) {
3179     testl(dst, imm32);
3180   } else {
3181     Assembler::testq(dst, imm32);
3182   }
3183 }
3184 
3185 void MacroAssembler::testq(Register dst, int32_t imm32) {
3186   if (imm32 >= 0) {
3187     testl(dst, imm32);
3188   } else {
3189     Assembler::testq(dst, imm32);
3190   }
3191 }
3192 
3193 #endif
3194 
3195 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3196   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3197   Assembler::pcmpeqb(dst, src);
3198 }
3199 
3200 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3201   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3202   Assembler::pcmpeqw(dst, src);
3203 }
3204 
3205 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3206   assert((dst->encoding() < 16),"XMM register should be 0-15");
3207   Assembler::pcmpestri(dst, src, imm8);
3208 }
3209 
3210 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3211   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3212   Assembler::pcmpestri(dst, src, imm8);
3213 }
3214 
3215 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3216   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3217   Assembler::pmovzxbw(dst, src);
3218 }
3219 
3220 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3221   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3222   Assembler::pmovzxbw(dst, src);
3223 }
3224 
3225 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3226   assert((src->encoding() < 16),"XMM register should be 0-15");
3227   Assembler::pmovmskb(dst, src);
3228 }
3229 
3230 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3231   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3232   Assembler::ptest(dst, src);
3233 }
3234 
3235 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src, Register rscratch) {
3236   assert(rscratch != noreg || always_reachable(src), "missing");
3237 
3238   if (reachable(src)) {
3239     Assembler::sqrtss(dst, as_Address(src));
3240   } else {
3241     lea(rscratch, src);
3242     Assembler::sqrtss(dst, Address(rscratch, 0));
3243   }
3244 }
3245 
3246 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
3247   assert(rscratch != noreg || always_reachable(src), "missing");
3248 
3249   if (reachable(src)) {
3250     Assembler::subsd(dst, as_Address(src));
3251   } else {
3252     lea(rscratch, src);
3253     Assembler::subsd(dst, Address(rscratch, 0));
3254   }
3255 }
3256 
3257 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch) {
3258   assert(rscratch != noreg || always_reachable(src), "missing");
3259 
3260   if (reachable(src)) {
3261     Assembler::roundsd(dst, as_Address(src), rmode);
3262   } else {
3263     lea(rscratch, src);
3264     Assembler::roundsd(dst, Address(rscratch, 0), rmode);
3265   }
3266 }
3267 
3268 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src, Register rscratch) {
3269   assert(rscratch != noreg || always_reachable(src), "missing");
3270 
3271   if (reachable(src)) {
3272     Assembler::subss(dst, as_Address(src));
3273   } else {
3274     lea(rscratch, src);
3275     Assembler::subss(dst, Address(rscratch, 0));
3276   }
3277 }
3278 
3279 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
3280   assert(rscratch != noreg || always_reachable(src), "missing");
3281 
3282   if (reachable(src)) {
3283     Assembler::ucomisd(dst, as_Address(src));
3284   } else {
3285     lea(rscratch, src);
3286     Assembler::ucomisd(dst, Address(rscratch, 0));
3287   }
3288 }
3289 
3290 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
3291   assert(rscratch != noreg || always_reachable(src), "missing");
3292 
3293   if (reachable(src)) {
3294     Assembler::ucomiss(dst, as_Address(src));
3295   } else {
3296     lea(rscratch, src);
3297     Assembler::ucomiss(dst, Address(rscratch, 0));
3298   }
3299 }
3300 
3301 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
3302   assert(rscratch != noreg || always_reachable(src), "missing");
3303 
3304   // Used in sign-bit flipping with aligned address.
3305   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3306   if (reachable(src)) {
3307     Assembler::xorpd(dst, as_Address(src));
3308   } else {
3309     lea(rscratch, src);
3310     Assembler::xorpd(dst, Address(rscratch, 0));
3311   }
3312 }
3313 
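// A sketch of the reasoning for the special case below (an assumption, not stated in the
// original sources): xorpd/xorps have no EVEX encoding without AVX512DQ, so with AVX512F
// only, the self-xor (register clearing) idiom is rewritten as a 512-bit integer vpxor,
// which AVX512F can always encode, including for xmm16-xmm31.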
3314 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3315   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3316     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3317   }
3318   else {
3319     Assembler::xorpd(dst, src);
3320   }
3321 }
3322 
3323 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3324   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3325     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3326   } else {
3327     Assembler::xorps(dst, src);
3328   }
3329 }
3330 
3331 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register rscratch) {
3332   assert(rscratch != noreg || always_reachable(src), "missing");
3333 
3334   // Used in sign-bit flipping with aligned address.
3335   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3336   if (reachable(src)) {
3337     Assembler::xorps(dst, as_Address(src));
3338   } else {
3339     lea(rscratch, src);
3340     Assembler::xorps(dst, Address(rscratch, 0));
3341   }
3342 }
3343 
3344 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src, Register rscratch) {
3345   assert(rscratch != noreg || always_reachable(src), "missing");
3346 
3347   // In SSE mode (no AVX) the pshufb memory operand must be 16-byte aligned.
3348   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3349   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3350   if (reachable(src)) {
3351     Assembler::pshufb(dst, as_Address(src));
3352   } else {
3353     lea(rscratch, src);
3354     Assembler::pshufb(dst, Address(rscratch, 0));
3355   }
3356 }
3357 
3358 // AVX 3-operands instructions
3359 
3360 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3361   assert(rscratch != noreg || always_reachable(src), "missing");
3362 
3363   if (reachable(src)) {
3364     vaddsd(dst, nds, as_Address(src));
3365   } else {
3366     lea(rscratch, src);
3367     vaddsd(dst, nds, Address(rscratch, 0));
3368   }
3369 }
3370 
3371 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3372   assert(rscratch != noreg || always_reachable(src), "missing");
3373 
3374   if (reachable(src)) {
3375     vaddss(dst, nds, as_Address(src));
3376   } else {
3377     lea(rscratch, src);
3378     vaddss(dst, nds, Address(rscratch, 0));
3379   }
3380 }
3381 
3382 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3383   assert(UseAVX > 0, "requires some form of AVX");
3384   assert(rscratch != noreg || always_reachable(src), "missing");
3385 
3386   if (reachable(src)) {
3387     Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
3388   } else {
3389     lea(rscratch, src);
3390     Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
3391   }
3392 }
3393 
3394 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3395   assert(UseAVX > 0, "requires some form of AVX");
3396   assert(rscratch != noreg || always_reachable(src), "missing");
3397 
3398   if (reachable(src)) {
3399     Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3400   } else {
3401     lea(rscratch, src);
3402     Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3403   }
3404 }
3405 
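// The "abs" helpers below AND the value with the mask passed in as negate_field; assuming
// the usual sign-mask constant (all bits set except the sign bit), this clears the sign bit
// and yields the absolute value.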
3406 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
3407   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3408   assert(rscratch != noreg || always_reachable(negate_field), "missing");
3409 
3410   vandps(dst, nds, negate_field, vector_len, rscratch);
3411 }
3412 
3413 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
3414   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3415   assert(rscratch != noreg || always_reachable(negate_field), "missing");
3416 
3417   vandpd(dst, nds, negate_field, vector_len, rscratch);
3418 }
3419 
3420 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3421   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3422   Assembler::vpaddb(dst, nds, src, vector_len);
3423 }
3424 
3425 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3426   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3427   Assembler::vpaddb(dst, nds, src, vector_len);
3428 }
3429 
3430 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3431   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3432   Assembler::vpaddw(dst, nds, src, vector_len);
3433 }
3434 
3435 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3436   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3437   Assembler::vpaddw(dst, nds, src, vector_len);
3438 }
3439 
3440 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3441   assert(rscratch != noreg || always_reachable(src), "missing");
3442 
3443   if (reachable(src)) {
3444     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3445   } else {
3446     lea(rscratch, src);
3447     Assembler::vpand(dst, nds, Address(rscratch, 0), vector_len);
3448   }
3449 }
3450 
3451 void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3452   assert(rscratch != noreg || always_reachable(src), "missing");
3453 
3454   if (reachable(src)) {
3455     Assembler::vpbroadcastd(dst, as_Address(src), vector_len);
3456   } else {
3457     lea(rscratch, src);
3458     Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len);
3459   }
3460 }
3461 
3462 void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3463   assert(rscratch != noreg || always_reachable(src), "missing");
3464 
3465   if (reachable(src)) {
3466     Assembler::vpbroadcastq(dst, as_Address(src), vector_len);
3467   } else {
3468     lea(rscratch, src);
3469     Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len);
3470   }
3471 }
3472 
3473 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3474   assert(rscratch != noreg || always_reachable(src), "missing");
3475 
3476   if (reachable(src)) {
3477     Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
3478   } else {
3479     lea(rscratch, src);
3480     Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
3481   }
3482 }
3483 
3484 void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3485   assert(rscratch != noreg || always_reachable(src), "missing");
3486 
3487   if (reachable(src)) {
3488     Assembler::vbroadcastss(dst, as_Address(src), vector_len);
3489   } else {
3490     lea(rscratch, src);
3491     Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len);
3492   }
3493 }
3494 
3495 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3496   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3497   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3498 }
3499 
3500 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3501   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3502   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3503 }
3504 
3505 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3506   assert(rscratch != noreg || always_reachable(src), "missing");
3507 
3508   if (reachable(src)) {
3509     Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3510   } else {
3511     lea(rscratch, src);
3512     Assembler::evpcmpeqd(kdst, mask, nds, Address(rscratch, 0), vector_len);
3513   }
3514 }
3515 
3516 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3517                              int comparison, bool is_signed, int vector_len, Register rscratch) {
3518   assert(rscratch != noreg || always_reachable(src), "missing");
3519 
3520   if (reachable(src)) {
3521     Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3522   } else {
3523     lea(rscratch, src);
3524     Assembler::evpcmpd(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3525   }
3526 }
3527 
3528 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3529                              int comparison, bool is_signed, int vector_len, Register rscratch) {
3530   assert(rscratch != noreg || always_reachable(src), "missing");
3531 
3532   if (reachable(src)) {
3533     Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3534   } else {
3535     lea(rscratch, src);
3536     Assembler::evpcmpq(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3537   }
3538 }
3539 
3540 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3541                              int comparison, bool is_signed, int vector_len, Register rscratch) {
3542   assert(rscratch != noreg || always_reachable(src), "missing");
3543 
3544   if (reachable(src)) {
3545     Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3546   } else {
3547     lea(rscratch, src);
3548     Assembler::evpcmpb(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3549   }
3550 }
3551 
3552 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3553                              int comparison, bool is_signed, int vector_len, Register rscratch) {
3554   assert(rscratch != noreg || always_reachable(src), "missing");
3555 
3556   if (reachable(src)) {
3557     Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3558   } else {
3559     lea(rscratch, src);
3560     Assembler::evpcmpw(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3561   }
3562 }
3563 
3564 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3565   if (width == Assembler::Q) {
3566     Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3567   } else {
3568     Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3569   }
3570 }
3571 
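// Condition encodings used below, based on the SSE/AVX opcode map (assumed, not taken from
// the original comments): 0x74..0x76 are pcmpeqb/w/d and 0x64..0x66 are pcmpgtb/w/d, hence
// eq_cond_enc = 0x74 + width and gt_cond_enc = 0x64 + width for byte/word/dword, while 0x29
// (pcmpeqq) and 0x37 (pcmpgtq) handle the quadword case. neq, le and nlt are synthesized
// from eq/gt followed by an XOR with all-ones (a bitwise NOT of the compare result).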
3572 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
3573   int eq_cond_enc = 0x29;
3574   int gt_cond_enc = 0x37;
3575   if (width != Assembler::Q) {
3576     eq_cond_enc = 0x74 + width;
3577     gt_cond_enc = 0x64 + width;
3578   }
3579   switch (cond) {
3580   case eq:
3581     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3582     break;
3583   case neq:
3584     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3585     vallones(xtmp, vector_len);
3586     vpxor(dst, xtmp, dst, vector_len);
3587     break;
3588   case le:
3589     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3590     vallones(xtmp, vector_len);
3591     vpxor(dst, xtmp, dst, vector_len);
3592     break;
3593   case nlt:
3594     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3595     vallones(xtmp, vector_len);
3596     vpxor(dst, xtmp, dst, vector_len);
3597     break;
3598   case lt:
3599     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3600     break;
3601   case nle:
3602     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3603     break;
3604   default:
3605     assert(false, "Should not reach here");
3606   }
3607 }
3608 
3609 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3610   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3611   Assembler::vpmovzxbw(dst, src, vector_len);
3612 }
3613 
3614 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3615   assert((src->encoding() < 16),"XMM register should be 0-15");
3616   Assembler::vpmovmskb(dst, src, vector_len);
3617 }
3618 
3619 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3620   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3621   Assembler::vpmullw(dst, nds, src, vector_len);
3622 }
3623 
3624 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3625   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3626   Assembler::vpmullw(dst, nds, src, vector_len);
3627 }
3628 
3629 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3630   assert((UseAVX > 0), "AVX support is needed");
3631   assert(rscratch != noreg || always_reachable(src), "missing");
3632 
3633   if (reachable(src)) {
3634     Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3635   } else {
3636     lea(rscratch, src);
3637     Assembler::vpmulld(dst, nds, Address(rscratch, 0), vector_len);
3638   }
3639 }
3640 
3641 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3642   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3643   Assembler::vpsubb(dst, nds, src, vector_len);
3644 }
3645 
3646 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3647   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3648   Assembler::vpsubb(dst, nds, src, vector_len);
3649 }
3650 
3651 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3652   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3653   Assembler::vpsubw(dst, nds, src, vector_len);
3654 }
3655 
3656 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3657   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3658   Assembler::vpsubw(dst, nds, src, vector_len);
3659 }
3660 
3661 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3662   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3663   Assembler::vpsraw(dst, nds, shift, vector_len);
3664 }
3665 
3666 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3667   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3668   Assembler::vpsraw(dst, nds, shift, vector_len);
3669 }
3670 
3671 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3672   assert(UseAVX > 2,"");
3673   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3674      vector_len = 2;
3675   }
3676   Assembler::evpsraq(dst, nds, shift, vector_len);
3677 }
3678 
3679 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3680   assert(UseAVX > 2,"");
3681   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3682      vector_len = 2;
3683   }
3684   Assembler::evpsraq(dst, nds, shift, vector_len);
3685 }
3686 
3687 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3688   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3689   Assembler::vpsrlw(dst, nds, shift, vector_len);
3690 }
3691 
3692 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3693   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3694   Assembler::vpsrlw(dst, nds, shift, vector_len);
3695 }
3696 
3697 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3698   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3699   Assembler::vpsllw(dst, nds, shift, vector_len);
3700 }
3701 
3702 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3703   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3704   Assembler::vpsllw(dst, nds, shift, vector_len);
3705 }
3706 
3707 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3708   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3709   Assembler::vptest(dst, src);
3710 }
3711 
3712 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3713   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3714   Assembler::punpcklbw(dst, src);
3715 }
3716 
3717 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3718   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3719   Assembler::pshufd(dst, src, mode);
3720 }
3721 
3722 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3723   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3724   Assembler::pshuflw(dst, src, mode);
3725 }
3726 
3727 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3728   assert(rscratch != noreg || always_reachable(src), "missing");
3729 
3730   if (reachable(src)) {
3731     vandpd(dst, nds, as_Address(src), vector_len);
3732   } else {
3733     lea(rscratch, src);
3734     vandpd(dst, nds, Address(rscratch, 0), vector_len);
3735   }
3736 }
3737 
3738 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3739   assert(rscratch != noreg || always_reachable(src), "missing");
3740 
3741   if (reachable(src)) {
3742     vandps(dst, nds, as_Address(src), vector_len);
3743   } else {
3744     lea(rscratch, src);
3745     vandps(dst, nds, Address(rscratch, 0), vector_len);
3746   }
3747 }
3748 
3749 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3750                             bool merge, int vector_len, Register rscratch) {
3751   assert(rscratch != noreg || always_reachable(src), "missing");
3752 
3753   if (reachable(src)) {
3754     Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3755   } else {
3756     lea(rscratch, src);
3757     Assembler::evpord(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
3758   }
3759 }
3760 
3761 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3762   assert(rscratch != noreg || always_reachable(src), "missing");
3763 
3764   if (reachable(src)) {
3765     vdivsd(dst, nds, as_Address(src));
3766   } else {
3767     lea(rscratch, src);
3768     vdivsd(dst, nds, Address(rscratch, 0));
3769   }
3770 }
3771 
3772 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3773   assert(rscratch != noreg || always_reachable(src), "missing");
3774 
3775   if (reachable(src)) {
3776     vdivss(dst, nds, as_Address(src));
3777   } else {
3778     lea(rscratch, src);
3779     vdivss(dst, nds, Address(rscratch, 0));
3780   }
3781 }
3782 
3783 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3784   assert(rscratch != noreg || always_reachable(src), "missing");
3785 
3786   if (reachable(src)) {
3787     vmulsd(dst, nds, as_Address(src));
3788   } else {
3789     lea(rscratch, src);
3790     vmulsd(dst, nds, Address(rscratch, 0));
3791   }
3792 }
3793 
3794 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3795   assert(rscratch != noreg || always_reachable(src), "missing");
3796 
3797   if (reachable(src)) {
3798     vmulss(dst, nds, as_Address(src));
3799   } else {
3800     lea(rscratch, src);
3801     vmulss(dst, nds, Address(rscratch, 0));
3802   }
3803 }
3804 
3805 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3806   assert(rscratch != noreg || always_reachable(src), "missing");
3807 
3808   if (reachable(src)) {
3809     vsubsd(dst, nds, as_Address(src));
3810   } else {
3811     lea(rscratch, src);
3812     vsubsd(dst, nds, Address(rscratch, 0));
3813   }
3814 }
3815 
3816 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3817   assert(rscratch != noreg || always_reachable(src), "missing");
3818 
3819   if (reachable(src)) {
3820     vsubss(dst, nds, as_Address(src));
3821   } else {
3822     lea(rscratch, src);
3823     vsubss(dst, nds, Address(rscratch, 0));
3824   }
3825 }
3826 
3827 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3828   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3829   assert(rscratch != noreg || always_reachable(src), "missing");
3830 
3831   vxorps(dst, nds, src, Assembler::AVX_128bit, rscratch);
3832 }
3833 
3834 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3835   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3836   assert(rscratch != noreg || always_reachable(src), "missing");
3837 
3838   vxorpd(dst, nds, src, Assembler::AVX_128bit, rscratch);
3839 }
3840 
3841 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3842   assert(rscratch != noreg || always_reachable(src), "missing");
3843 
3844   if (reachable(src)) {
3845     vxorpd(dst, nds, as_Address(src), vector_len);
3846   } else {
3847     lea(rscratch, src);
3848     vxorpd(dst, nds, Address(rscratch, 0), vector_len);
3849   }
3850 }
3851 
3852 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3853   assert(rscratch != noreg || always_reachable(src), "missing");
3854 
3855   if (reachable(src)) {
3856     vxorps(dst, nds, as_Address(src), vector_len);
3857   } else {
3858     lea(rscratch, src);
3859     vxorps(dst, nds, Address(rscratch, 0), vector_len);
3860   }
3861 }
3862 
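// 256-bit integer instructions such as vpxor require AVX2; 256-bit floating-point logicals
// are already available with AVX1. With only AVX1 and a 256-bit vector length the xor below
// therefore falls back to vxorpd.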
3863 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3864   assert(rscratch != noreg || always_reachable(src), "missing");
3865 
3866   if (UseAVX > 1 || (vector_len < 1)) {
3867     if (reachable(src)) {
3868       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3869     } else {
3870       lea(rscratch, src);
3871       Assembler::vpxor(dst, nds, Address(rscratch, 0), vector_len);
3872     }
3873   } else {
3874     MacroAssembler::vxorpd(dst, nds, src, vector_len, rscratch);
3875   }
3876 }
3877 
3878 void MacroAssembler::vpermd(XMMRegister dst,  XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3879   assert(rscratch != noreg || always_reachable(src), "missing");
3880 
3881   if (reachable(src)) {
3882     Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3883   } else {
3884     lea(rscratch, src);
3885     Assembler::vpermd(dst, nds, Address(rscratch, 0), vector_len);
3886   }
3887 }
3888 
3889 void MacroAssembler::clear_jobject_tag(Register possibly_non_local) {
3890   const int32_t inverted_mask = ~static_cast<int32_t>(JNIHandles::tag_mask);
3891   STATIC_ASSERT(inverted_mask == -4); // otherwise check this code
3892   // The inverted mask is sign-extended
3893   andptr(possibly_non_local, inverted_mask);
3894 }
3895 
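// Resolve a jobject to the oop it refers to. A null handle is passed through unchanged.
// Untagged values are local handles and are loaded directly; tagged values are global or
// weak-global handles, whose type tag is subtracted from the address before the load
// (weak globals additionally use an ON_PHANTOM_OOP_REF load so the GC treats them correctly).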
3896 void MacroAssembler::resolve_jobject(Register value,
3897                                      Register thread,
3898                                      Register tmp) {
3899   assert_different_registers(value, thread, tmp);
3900   Label done, tagged, weak_tagged;
3901   testptr(value, value);
3902   jcc(Assembler::zero, done);           // Use null as-is.
3903   testptr(value, JNIHandles::tag_mask); // Test for tag.
3904   jcc(Assembler::notZero, tagged);
3905 
3906   // Resolve local handle
3907   access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp, thread);
3908   verify_oop(value);
3909   jmp(done);
3910 
3911   bind(tagged);
3912   testptr(value, JNIHandles::TypeTag::weak_global); // Test for weak tag.
3913   jcc(Assembler::notZero, weak_tagged);
3914 
3915   // Resolve global handle
3916   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp, thread);
3917   verify_oop(value);
3918   jmp(done);
3919 
3920   bind(weak_tagged);
3921   // Resolve jweak.
3922   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3923                  value, Address(value, -JNIHandles::TypeTag::weak_global), tmp, thread);
3924   verify_oop(value);
3925 
3926   bind(done);
3927 }
3928 
3929 void MacroAssembler::resolve_global_jobject(Register value,
3930                                             Register thread,
3931                                             Register tmp) {
3932   assert_different_registers(value, thread, tmp);
3933   Label done;
3934 
3935   testptr(value, value);
3936   jcc(Assembler::zero, done);           // Use null as-is.
3937 
3938 #ifdef ASSERT
3939   {
3940     Label valid_global_tag;
3941     testptr(value, JNIHandles::TypeTag::global); // Test for global tag.
3942     jcc(Assembler::notZero, valid_global_tag);
3943     stop("non global jobject using resolve_global_jobject");
3944     bind(valid_global_tag);
3945   }
3946 #endif
3947 
3948   // Resolve global handle
3949   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp, thread);
3950   verify_oop(value);
3951 
3952   bind(done);
3953 }
3954 
3955 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3956   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
3957 }
3958 
3959 // Force generation of a 4 byte immediate value even if it fits into 8 bits
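// (Presumably so that call sites get a fixed instruction length that does not depend on the
// value of the immediate.)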
3960 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3961   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
3962 }
3963 
3964 void MacroAssembler::subptr(Register dst, Register src) {
3965   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
3966 }
3967 
3968 // C++ bool manipulation
3969 void MacroAssembler::testbool(Register dst) {
3970   if (sizeof(bool) == 1)
3971     testb(dst, 0xff);
3972   else if (sizeof(bool) == 2) {
3973     // testw implementation needed for two byte bools
3974     ShouldNotReachHere();
3975   } else if (sizeof(bool) == 4)
3976     testl(dst, dst);
3977   else
3978     // unsupported
3979     ShouldNotReachHere();
3980 }
3981 
3982 void MacroAssembler::testptr(Register dst, Register src) {
3983   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
3984 }
3985 
3986 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3987 void MacroAssembler::tlab_allocate(Register thread, Register obj,
3988                                    Register var_size_in_bytes,
3989                                    int con_size_in_bytes,
3990                                    Register t1,
3991                                    Register t2,
3992                                    Label& slow_case) {
3993   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3994   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3995 }
3996 
3997 RegSet MacroAssembler::call_clobbered_gp_registers() {
3998   RegSet regs;
3999 #ifdef _LP64
4000   regs += RegSet::of(rax, rcx, rdx);
4001 #ifndef _WINDOWS
4002   regs += RegSet::of(rsi, rdi);
4003 #endif
4004   regs += RegSet::range(r8, r11);
4005 #else
4006   regs += RegSet::of(rax, rcx, rdx);
4007 #endif
4008   return regs;
4009 }
4010 
4011 XMMRegSet MacroAssembler::call_clobbered_xmm_registers() {
4012   int num_xmm_registers = XMMRegister::available_xmm_registers();
4013 #if defined(_WINDOWS) && defined(_LP64)
4014   XMMRegSet result = XMMRegSet::range(xmm0, xmm5);
4015   if (num_xmm_registers > 16) {
4016      result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1));
4017   }
4018   return result;
4019 #else
4020   return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1));
4021 #endif
4022 }
4023 
4024 static int FPUSaveAreaSize = align_up(108, StackAlignmentInBytes); // 108 bytes needed for FPU state by fsave/frstor
4025 
4026 #ifndef _LP64
4027 static bool use_x87_registers() { return UseSSE < 2; }
4028 #endif
4029 static bool use_xmm_registers() { return UseSSE >= 1; }
4030 
4031 // C1 only ever uses the first double/float of the XMM register.
4032 static int xmm_save_size() { return UseSSE >= 2 ? sizeof(double) : sizeof(float); }
4033 
4034 static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
4035   if (UseSSE == 1) {
4036     masm->movflt(Address(rsp, offset), reg);
4037   } else {
4038     masm->movdbl(Address(rsp, offset), reg);
4039   }
4040 }
4041 
4042 static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
4043   if (UseSSE == 1) {
4044     masm->movflt(reg, Address(rsp, offset));
4045   } else {
4046     masm->movdbl(reg, Address(rsp, offset));
4047   }
4048 }
4049 
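// Computes the sizes of the three sections of the save area used by
// push/pop_call_clobbered_registers_except: general-purpose registers first (aligned up to
// the stack alignment), then the x87 FPU state (32-bit with UseSSE < 2 only), then one
// float/double slot per XMM register. Returns the total size in bytes.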
4050 int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers, bool save_fpu,
4051                            int& gp_area_size, int& fp_area_size, int& xmm_area_size) {
4052 
4053   gp_area_size = align_up(gp_registers.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size,
4054                          StackAlignmentInBytes);
4055 #ifdef _LP64
4056   fp_area_size = 0;
4057 #else
4058   fp_area_size = (save_fpu && use_x87_registers()) ? FPUSaveAreaSize : 0;
4059 #endif
4060   xmm_area_size = (save_fpu && use_xmm_registers()) ? xmm_registers.size() * xmm_save_size() : 0;
4061 
4062   return gp_area_size + fp_area_size + xmm_area_size;
4063 }
4064 
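// Saves all call-clobbered general-purpose registers except 'exclude', plus (optionally)
// the x87/XMM state, into a freshly allocated, stack-aligned area below rsp. Must be paired
// with pop_call_clobbered_registers_except using the same exclude set and save_fpu flag.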
4065 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) {
4066   block_comment("push_call_clobbered_registers start");
4067   // Regular registers
4068   RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude;
4069 
4070   int gp_area_size;
4071   int fp_area_size;
4072   int xmm_area_size;
4073   int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu,
4074                                                gp_area_size, fp_area_size, xmm_area_size);
4075   subptr(rsp, total_save_size);
4076 
4077   push_set(gp_registers_to_push, 0);
4078 
4079 #ifndef _LP64
4080   if (save_fpu && use_x87_registers()) {
4081     fnsave(Address(rsp, gp_area_size));
4082     fwait();
4083   }
4084 #endif
4085   if (save_fpu && use_xmm_registers()) {
4086     push_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size);
4087   }
4088 
4089   block_comment("push_call_clobbered_registers end");
4090 }
4091 
4092 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) {
4093   block_comment("pop_call_clobbered_registers start");
4094 
4095   RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude;
4096 
4097   int gp_area_size;
4098   int fp_area_size;
4099   int xmm_area_size;
4100   int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu,
4101                                                gp_area_size, fp_area_size, xmm_area_size);
4102 
4103   if (restore_fpu && use_xmm_registers()) {
4104     pop_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size);
4105   }
4106 #ifndef _LP64
4107   if (restore_fpu && use_x87_registers()) {
4108     frstor(Address(rsp, gp_area_size));
4109   }
4110 #endif
4111 
4112   pop_set(gp_registers_to_pop, 0);
4113 
4114   addptr(rsp, total_save_size);
4115 
4116   vzeroupper();
4117 
4118   block_comment("pop_call_clobbered_registers end");
4119 }
4120 
4121 void MacroAssembler::push_set(XMMRegSet set, int offset) {
4122   assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be");
4123   int spill_offset = offset;
4124 
4125   for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) {
4126     save_xmm_register(this, spill_offset, *it);
4127     spill_offset += xmm_save_size();
4128   }
4129 }
4130 
4131 void MacroAssembler::pop_set(XMMRegSet set, int offset) {
4132   int restore_size = set.size() * xmm_save_size();
4133   assert(is_aligned(restore_size, StackAlignmentInBytes), "must be");
4134 
4135   int restore_offset = offset + restore_size - xmm_save_size();
4136 
4137   for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) {
4138     restore_xmm_register(this, restore_offset, *it);
4139     restore_offset -= xmm_save_size();
4140   }
4141 }
4142 
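// For the RegSet variants below, an offset of -1 means push_set allocates (and pop_set
// later releases) its own stack-aligned area; otherwise the caller has already reserved the
// space and 'offset' is the position of the first slot relative to rsp.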
4143 void MacroAssembler::push_set(RegSet set, int offset) {
4144   int spill_offset;
4145   if (offset == -1) {
4146     int register_push_size = set.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
4147     int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
4148     subptr(rsp, aligned_size);
4149     spill_offset = 0;
4150   } else {
4151     spill_offset = offset;
4152   }
4153 
4154   for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
4155     movptr(Address(rsp, spill_offset), *it);
4156     spill_offset += Register::max_slots_per_register * VMRegImpl::stack_slot_size;
4157   }
4158 }
4159 
4160 void MacroAssembler::pop_set(RegSet set, int offset) {
4161 
4162   int gp_reg_size = Register::max_slots_per_register * VMRegImpl::stack_slot_size;
4163   int restore_size = set.size() * gp_reg_size;
4164   int aligned_size = align_up(restore_size, StackAlignmentInBytes);
4165 
4166   int restore_offset;
4167   if (offset == -1) {
4168     restore_offset = restore_size - gp_reg_size;
4169   } else {
4170     restore_offset = offset + restore_size - gp_reg_size;
4171   }
4172   for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) {
4173     movptr(*it, Address(rsp, restore_offset));
4174     restore_offset -= gp_reg_size;
4175   }
4176 
4177   if (offset == -1) {
4178     addptr(rsp, aligned_size);
4179   }
4180 }
4181 
4182 // Preserves the contents of address; destroys the contents of length_in_bytes and temp.
4183 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
4184   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
4185   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
4186   Label done;
4187 
4188   testptr(length_in_bytes, length_in_bytes);
4189   jcc(Assembler::zero, done);
4190 
4191   // initialize topmost word, divide index by 2, check if odd and test if zero
4192   // note: for the remaining code to work, index must be a multiple of BytesPerWord
4193 #ifdef ASSERT
4194   {
4195     Label L;
4196     testptr(length_in_bytes, BytesPerWord - 1);
4197     jcc(Assembler::zero, L);
4198     stop("length must be a multiple of BytesPerWord");
4199     bind(L);
4200   }
4201 #endif
4202   Register index = length_in_bytes;
4203   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
4204   if (UseIncDec) {
4205     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
4206   } else {
4207     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
4208     shrptr(index, 1);
4209   }
4210 #ifndef _LP64
4211   // index could have not been a multiple of 8 (i.e., bit 2 was set)
4212   {
4213     Label even;
4214     // note: if index was a multiple of 8, then it cannot
4215     //       be 0 now otherwise it must have been 0 before
4216     //       => if it is even, we don't need to check for 0 again
4217     jcc(Assembler::carryClear, even);
4218     // clear topmost word (no jump would be needed if conditional assignment worked here)
4219     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
4220     // index could be 0 now, must check again
4221     jcc(Assembler::zero, done);
4222     bind(even);
4223   }
4224 #endif // !_LP64
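  // Worked example (64-bit, hypothetical values): with length_in_bytes == 24 the shift above
  // leaves index == 3, and the loop below stores the zero register at offset_in_bytes + 16,
  // + 8 and + 0, clearing exactly 24 bytes starting at address + offset_in_bytes.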
4225   // initialize remaining object fields: index is a multiple of 2 now
4226   {
4227     Label loop;
4228     bind(loop);
4229     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
4230     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
4231     decrement(index);
4232     jcc(Assembler::notZero, loop);
4233   }
4234 
4235   bind(done);
4236 }
4237 
4238 // Look up the method for a megamorphic invokeinterface call.
4239 // The target method is determined by <intf_klass, itable_index>.
4240 // The receiver klass is in recv_klass.
4241 // On success, the result will be in method_result, and execution falls through.
4242 // On failure, execution transfers to the given label.
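// The itable is laid out directly after the vtable: a sequence of itableOffsetEntry
// (interface, offset) pairs terminated by a null interface, where 'offset' locates that
// interface's block of itableMethodEntry slots relative to the receiver klass.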
4243 void MacroAssembler::lookup_interface_method(Register recv_klass,
4244                                              Register intf_klass,
4245                                              RegisterOrConstant itable_index,
4246                                              Register method_result,
4247                                              Register scan_temp,
4248                                              Label& L_no_such_interface,
4249                                              bool return_method) {
4250   assert_different_registers(recv_klass, intf_klass, scan_temp);
4251   assert_different_registers(method_result, intf_klass, scan_temp);
4252   assert(recv_klass != method_result || !return_method,
4253          "recv_klass can be destroyed when method isn't needed");
4254 
4255   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4256          "caller must use same register for non-constant itable index as for method");
4257 
4258   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
4259   int vtable_base = in_bytes(Klass::vtable_start_offset());
4260   int itentry_off = in_bytes(itableMethodEntry::method_offset());
4261   int scan_step   = itableOffsetEntry::size() * wordSize;
4262   int vte_size    = vtableEntry::size_in_bytes();
4263   Address::ScaleFactor times_vte_scale = Address::times_ptr;
4264   assert(vte_size == wordSize, "else adjust times_vte_scale");
4265 
4266   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4267 
4268   // %%% Could store the aligned, prescaled offset in the klassoop.
4269   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
4270 
4271   if (return_method) {
4272     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4273     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4274     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
4275   }
4276 
4277   // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
4278   //   if (scan->interface() == intf) {
4279   //     result = (klass + scan->offset() + itable_index);
4280   //   }
4281   // }
4282   Label search, found_method;
4283 
4284   for (int peel = 1; peel >= 0; peel--) {
4285     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset()));
4286     cmpptr(intf_klass, method_result);
4287 
4288     if (peel) {
4289       jccb(Assembler::equal, found_method);
4290     } else {
4291       jccb(Assembler::notEqual, search);
4292       // (invert the test to fall through to found_method...)
4293     }
4294 
4295     if (!peel)  break;
4296 
4297     bind(search);
4298 
4299     // Check that the previous entry is non-null.  A null entry means that
4300     // the receiver class doesn't implement the interface, and wasn't the
4301     // same as when the caller was compiled.
4302     testptr(method_result, method_result);
4303     jcc(Assembler::zero, L_no_such_interface);
4304     addptr(scan_temp, scan_step);
4305   }
4306 
4307   bind(found_method);
4308 
4309   if (return_method) {
4310     // Got a hit.
4311     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset()));
4312     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
4313   }
4314 }
4315 
4316 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
4317 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICHolder
4318 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
4319 // The target method is determined by <holder_klass, itable_index>.
4320 // The receiver klass is in recv_klass.
4321 // On success, the result will be in method_result, and execution falls through.
4322 // On failure, execution transfers to the given label.
4323 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
4324                                                   Register holder_klass,
4325                                                   Register resolved_klass,
4326                                                   Register method_result,
4327                                                   Register scan_temp,
4328                                                   Register temp_reg2,
4329                                                   Register receiver,
4330                                                   int itable_index,
4331                                                   Label& L_no_such_interface) {
4332   assert_different_registers(recv_klass, method_result, holder_klass, resolved_klass, scan_temp, temp_reg2, receiver);
4333   Register temp_itbl_klass = method_result;
4334   Register temp_reg = (temp_reg2 == noreg ? recv_klass : temp_reg2); // reuse recv_klass register on 32-bit x86 impl
4335 
4336   int vtable_base = in_bytes(Klass::vtable_start_offset());
4337   int itentry_off = in_bytes(itableMethodEntry::method_offset());
4338   int scan_step = itableOffsetEntry::size() * wordSize;
4339   int vte_size = vtableEntry::size_in_bytes();
4340   int ioffset = in_bytes(itableOffsetEntry::interface_offset());
4341   int ooffset = in_bytes(itableOffsetEntry::offset_offset());
4342   Address::ScaleFactor times_vte_scale = Address::times_ptr;
4343   assert(vte_size == wordSize, "adjust times_vte_scale");
4344 
4345   Label L_loop_scan_resolved_entry, L_resolved_found, L_holder_found;
4346 
4347   // temp_itbl_klass = recv_klass.itable[0]
4348   // scan_temp = &recv_klass.itable[0] + step
4349   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4350   movptr(temp_itbl_klass, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset));
4351   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset + scan_step));
4352   xorptr(temp_reg, temp_reg);
4353 
4354   // Initial checks:
4355   //   - if (holder_klass != resolved_klass), go to "scan for resolved"
4356   //   - if (itable[0] == 0), no such interface
4357   //   - if (itable[0] == holder_klass), shortcut to "holder found"
4358   cmpptr(holder_klass, resolved_klass);
4359   jccb(Assembler::notEqual, L_loop_scan_resolved_entry);
4360   testptr(temp_itbl_klass, temp_itbl_klass);
4361   jccb(Assembler::zero, L_no_such_interface);
4362   cmpptr(holder_klass, temp_itbl_klass);
4363   jccb(Assembler::equal, L_holder_found);
4364 
4365   // Loop: Look for holder_klass record in itable
4366   //   do {
4367   //     tmp = itable[index];
4368   //     index += step;
4369   //     if (tmp == holder_klass) {
4370   //       goto L_holder_found; // Found!
4371   //     }
4372   //   } while (tmp != 0);
4373   //   goto L_no_such_interface // Not found.
4374   Label L_scan_holder;
4375   bind(L_scan_holder);
4376     movptr(temp_itbl_klass, Address(scan_temp, 0));
4377     addptr(scan_temp, scan_step);
4378     cmpptr(holder_klass, temp_itbl_klass);
4379     jccb(Assembler::equal, L_holder_found);
4380     testptr(temp_itbl_klass, temp_itbl_klass);
4381     jccb(Assembler::notZero, L_scan_holder);
4382 
4383   jmpb(L_no_such_interface);
4384 
4385   // Loop: Look for resolved_class record in itable
4386   //   do {
4387   //     tmp = itable[index];
4388   //     index += step;
4389   //     if (tmp == holder_klass) {
4390   //        // Also check if we have met a holder klass
4391   //        holder_tmp = itable[index-step-ioffset];
4392   //     }
4393   //     if (tmp == resolved_klass) {
4394   //        goto L_resolved_found;  // Found!
4395   //     }
4396   //   } while (tmp != 0);
4397   //   goto L_no_such_interface // Not found.
4398   //
4399   Label L_loop_scan_resolved;
4400   bind(L_loop_scan_resolved);
4401     movptr(temp_itbl_klass, Address(scan_temp, 0));
4402     addptr(scan_temp, scan_step);
4403     bind(L_loop_scan_resolved_entry);
4404     cmpptr(holder_klass, temp_itbl_klass);
4405     cmovl(Assembler::equal, temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
4406     cmpptr(resolved_klass, temp_itbl_klass);
4407     jccb(Assembler::equal, L_resolved_found);
4408     testptr(temp_itbl_klass, temp_itbl_klass);
4409     jccb(Assembler::notZero, L_loop_scan_resolved);
4410 
4411   jmpb(L_no_such_interface);
4412 
4413   Label L_ready;
4414 
4415   // See if we already have a holder klass. If not, go and scan for it.
4416   bind(L_resolved_found);
4417   testptr(temp_reg, temp_reg);
4418   jccb(Assembler::zero, L_scan_holder);
4419   jmpb(L_ready);
4420 
4421   bind(L_holder_found);
4422   movl(temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
4423 
4424   // Finally, temp_reg contains holder_klass vtable offset
4425   bind(L_ready);
4426   assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4427   if (temp_reg2 == noreg) { // recv_klass register is clobbered for 32-bit x86 impl
4428     load_klass(scan_temp, receiver, noreg);
4429     movptr(method_result, Address(scan_temp, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
4430   } else {
4431     movptr(method_result, Address(recv_klass, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
4432   }
4433 }
4434 
4435 
4436 // virtual method calling
4437 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4438                                            RegisterOrConstant vtable_index,
4439                                            Register method_result) {
4440   const ByteSize base = Klass::vtable_start_offset();
4441   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4442   Address vtable_entry_addr(recv_klass,
4443                             vtable_index, Address::times_ptr,
4444                             base + vtableEntry::method_offset());
4445   movptr(method_result, vtable_entry_addr);
4446 }
4447 
4448 
4449 void MacroAssembler::check_klass_subtype(Register sub_klass,
4450                            Register super_klass,
4451                            Register temp_reg,
4452                            Label& L_success) {
4453   Label L_failure;
4454   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, nullptr);
4455   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, nullptr);
4456   bind(L_failure);
4457 }
4458 
4459 
4460 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4461                                                    Register super_klass,
4462                                                    Register temp_reg,
4463                                                    Label* L_success,
4464                                                    Label* L_failure,
4465                                                    Label* L_slow_path,
4466                                         RegisterOrConstant super_check_offset) {
4467   assert_different_registers(sub_klass, super_klass, temp_reg);
4468   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
4469   if (super_check_offset.is_register()) {
4470     assert_different_registers(sub_klass, super_klass,
4471                                super_check_offset.as_register());
4472   } else if (must_load_sco) {
4473     assert(temp_reg != noreg, "supply either a temp or a register offset");
4474   }
4475 
4476   Label L_fallthrough;
4477   int label_nulls = 0;
4478   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4479   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4480   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
4481   assert(label_nulls <= 1, "at most one null in the batch");
4482 
4483   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4484   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4485   Address super_check_offset_addr(super_klass, sco_offset);
4486 
4487   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
4488   // range of a jccb.  If this routine grows larger, reconsider at
4489   // least some of these.
4490 #define local_jcc(assembler_cond, label)                                \
4491   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
4492   else                             jcc( assembler_cond, label) /*omit semi*/
4493 
4494   // Hacked jmp, which may only be used just before L_fallthrough.
4495 #define final_jmp(label)                                                \
4496   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4497   else                            jmp(label)                /*omit semi*/
4498 
4499   // If the pointers are equal, we are done (e.g., String[] elements).
4500   // This self-check enables sharing of secondary supertype arrays among
4501   // non-primary types such as array-of-interface.  Otherwise, each such
4502   // type would need its own customized SSA.
4503   // We move this check to the front of the fast path because many
4504   // type checks are in fact trivially successful in this manner,
4505   // so we get a nicely predicted branch right at the start of the check.
4506   cmpptr(sub_klass, super_klass);
4507   local_jcc(Assembler::equal, *L_success);
4508 
4509   // Check the supertype display:
4510   if (must_load_sco) {
4511     // Positive movl does right thing on LP64.
4512     movl(temp_reg, super_check_offset_addr);
4513     super_check_offset = RegisterOrConstant(temp_reg);
4514   }
4515   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4516   cmpptr(super_klass, super_check_addr); // load displayed supertype
4517 
4518   // This check has worked decisively for primary supers.
4519   // Secondary supers are sought in the super_cache ('super_cache_addr').
4520   // (Secondary supers are interfaces and very deeply nested subtypes.)
4521   // This works in the same check above because of a tricky aliasing
4522   // between the super_cache and the primary super display elements.
4523   // (The 'super_check_addr' can address either, as the case requires.)
4524   // Note that the cache is updated below if it does not help us find
4525   // what we need immediately.
4526   // So if it was a primary super, we can just fail immediately.
4527   // Otherwise, it's the slow path for us (no success at this point).
4528 
4529   if (super_check_offset.is_register()) {
4530     local_jcc(Assembler::equal, *L_success);
4531     cmpl(super_check_offset.as_register(), sc_offset);
4532     if (L_failure == &L_fallthrough) {
4533       local_jcc(Assembler::equal, *L_slow_path);
4534     } else {
4535       local_jcc(Assembler::notEqual, *L_failure);
4536       final_jmp(*L_slow_path);
4537     }
4538   } else if (super_check_offset.as_constant() == sc_offset) {
4539     // Need a slow path; fast failure is impossible.
4540     if (L_slow_path == &L_fallthrough) {
4541       local_jcc(Assembler::equal, *L_success);
4542     } else {
4543       local_jcc(Assembler::notEqual, *L_slow_path);
4544       final_jmp(*L_success);
4545     }
4546   } else {
4547     // No slow path; it's a fast decision.
4548     if (L_failure == &L_fallthrough) {
4549       local_jcc(Assembler::equal, *L_success);
4550     } else {
4551       local_jcc(Assembler::notEqual, *L_failure);
4552       final_jmp(*L_success);
4553     }
4554   }
4555 
4556   bind(L_fallthrough);
4557 
4558 #undef local_jcc
4559 #undef final_jmp
4560 }
4561 
4562 
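     // Slow part of the subtype check: a linear scan of sub_klass' secondary
     // supers array using repne_scan. On a hit the super is installed in the
     // secondary super cache and we jump to L_success; on a miss we branch to
     // L_failure.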
4563 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4564                                                    Register super_klass,
4565                                                    Register temp_reg,
4566                                                    Register temp2_reg,
4567                                                    Label* L_success,
4568                                                    Label* L_failure,
4569                                                    bool set_cond_codes) {
4570   assert_different_registers(sub_klass, super_klass, temp_reg);
4571   if (temp2_reg != noreg)
4572     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4573 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4574 
4575   Label L_fallthrough;
4576   int label_nulls = 0;
4577   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
4578   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
4579   assert(label_nulls <= 1, "at most one null in the batch");
4580 
4581   // a couple of useful fields in sub_klass:
4582   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4583   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4584   Address secondary_supers_addr(sub_klass, ss_offset);
4585   Address super_cache_addr(     sub_klass, sc_offset);
4586 
4587   // Do a linear scan of the secondary super-klass chain.
4588   // This code is rarely used, so simplicity is a virtue here.
4589   // The repne_scan instruction uses fixed registers, which we must spill.
4590   // Don't worry too much about pre-existing connections with the input regs.
4591 
4592   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4593   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4594 
4595   // Get super_klass value into rax (even if it was in rdi or rcx).
4596   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4597   if (super_klass != rax) {
4598     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4599     mov(rax, super_klass);
4600   }
4601   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4602   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4603 
4604 #ifndef PRODUCT
4605   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4606   ExternalAddress pst_counter_addr((address) pst_counter);
4607   NOT_LP64(  incrementl(pst_counter_addr) );
4608   LP64_ONLY( lea(rcx, pst_counter_addr) );
4609   LP64_ONLY( incrementl(Address(rcx, 0)) );
4610 #endif //PRODUCT
4611 
4612   // We will consult the secondary-super array.
4613   movptr(rdi, secondary_supers_addr);
4614   // Load the array length.  (Positive movl does right thing on LP64.)
4615   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4616   // Skip to start of data.
4617   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4618 
4619   // Scan RCX words at [RDI] for an occurrence of RAX.
4620   // Set NZ/Z based on last compare.
4621   // The Z flag will not be set by 'repne' if RCX == 0, since 'repne' itself does
4622   // not change flags; only the repeated scas instruction sets them.
4623   // Set Z = 0 (not equal) before 'repne' to indicate that the class was not found.
4624 
4625   testptr(rax, rax); // Set Z = 0
4626   repne_scan();
4627 
4628   // Unspill the temp. registers:
4629   if (pushed_rdi)  pop(rdi);
4630   if (pushed_rcx)  pop(rcx);
4631   if (pushed_rax)  pop(rax);
4632 
4633   if (set_cond_codes) {
4634     // Special hack for the AD files:  rdi is guaranteed non-zero.
4635     assert(!pushed_rdi, "rdi must be left non-null");
4636     // Also, the condition codes are properly set Z/NZ on success/failure.
4637   }
4638 
4639   if (L_failure == &L_fallthrough)
4640         jccb(Assembler::notEqual, *L_failure);
4641   else  jcc(Assembler::notEqual, *L_failure);
4642 
4643   // Success.  Cache the super we found and proceed in triumph.
4644   movptr(super_cache_addr, super_klass);
4645 
4646   if (L_success != &L_fallthrough) {
4647     jmp(*L_success);
4648   }
4649 
4650 #undef IS_A_TEMP
4651 
4652   bind(L_fallthrough);
4653 }
4654 
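     // Class initialization barrier: branches to L_fast_path when 'klass' is
     // fully initialized or the current thread is its initializing thread,
     // otherwise control continues at L_slow_path.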
4655 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4656   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
4657 
4658   Label L_fallthrough;
4659   if (L_fast_path == nullptr) {
4660     L_fast_path = &L_fallthrough;
4661   } else if (L_slow_path == nullptr) {
4662     L_slow_path = &L_fallthrough;
4663   }
4664 
4665   // Fast path check: class is fully initialized
4666   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4667   jcc(Assembler::equal, *L_fast_path);
4668 
4669   // Fast path check: current thread is initializer thread
4670   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4671   if (L_slow_path == &L_fallthrough) {
4672     jcc(Assembler::equal, *L_fast_path);
4673     bind(*L_slow_path);
4674   } else if (L_fast_path == &L_fallthrough) {
4675     jcc(Assembler::notEqual, *L_slow_path);
4676     bind(*L_fast_path);
4677   } else {
4678     Unimplemented();
4679   }
4680 }
4681 
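     // Conditional 32-bit move; emulated with a short branch on CPUs without CMOV support.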
4682 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4683   if (VM_Version::supports_cmov()) {
4684     cmovl(cc, dst, src);
4685   } else {
4686     Label L;
4687     jccb(negate_condition(cc), L);
4688     movl(dst, src);
4689     bind(L);
4690   }
4691 }
4692 
4693 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4694   if (VM_Version::supports_cmov()) {
4695     cmovl(cc, dst, src);
4696   } else {
4697     Label L;
4698     jccb(negate_condition(cc), L);
4699     movl(dst, src);
4700     bind(L);
4701   }
4702 }
4703 
4704 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4705   if (!VerifyOops) return;
4706 
4707   BLOCK_COMMENT("verify_oop {");
4708 #ifdef _LP64
4709   push(rscratch1);
4710 #endif
4711   push(rax);                          // save rax
4712   push(reg);                          // pass register argument
4713 
4714   // Pass a message string (including the register name) to the verify_oop subroutine
4715   const char* b = nullptr;
4716   {
4717     ResourceMark rm;
4718     stringStream ss;
4719     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4720     b = code_string(ss.as_string());
4721   }
4722   ExternalAddress buffer((address) b);
4723   pushptr(buffer.addr(), rscratch1);
4724 
4725   // call indirectly to solve generation ordering problem
4726   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4727   call(rax);
4728   // Caller pops the arguments (oop, message) and restores rax, r10
4729   BLOCK_COMMENT("} verify_oop");
4730 }
4731 
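     // Set all bits of the destination vector register to 1.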
4732 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
4733   if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4734     // Only pcmpeq is treated as dependency-breaking (i.e., execution can begin
4735     // without waiting for the previous result in dst); vpcmpeqd is not, so just use vpternlog.
4736     vpternlogd(dst, 0xFF, dst, dst, vector_len);
4737   } else if (VM_Version::supports_avx()) {
4738     vpcmpeqd(dst, dst, dst, vector_len);
4739   } else {
4740     assert(VM_Version::supports_sse2(), "");
4741     pcmpeqd(dst, dst);
4742   }
4743 }
4744 
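     // Address of the interpreter expression-stack slot 'arg_slot' (plus
     // 'extra_slot_offset' slots) relative to rsp, taking the return PC on the
     // stack into account.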
4745 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4746                                          int extra_slot_offset) {
4747   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4748   int stackElementSize = Interpreter::stackElementSize;
4749   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4750 #ifdef ASSERT
4751   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4752   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4753 #endif
4754   Register             scale_reg    = noreg;
4755   Address::ScaleFactor scale_factor = Address::no_scale;
4756   if (arg_slot.is_constant()) {
4757     offset += arg_slot.as_constant() * stackElementSize;
4758   } else {
4759     scale_reg    = arg_slot.as_register();
4760     scale_factor = Address::times(stackElementSize);
4761   }
4762   offset += wordSize;           // return PC is on stack
4763   return Address(rsp, scale_reg, scale_factor, offset);
4764 }
4765 
4766 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
4767   if (!VerifyOops) return;
4768 
4769 #ifdef _LP64
4770   push(rscratch1);
4771 #endif
4772   push(rax); // save rax,
4773   // addr may contain rsp, so we will have to adjust it based on the push
4774   // we just did (and on 64 bit we do two pushes).
4775   // NOTE: the 64-bit code seemed to have a bug in that it did movq(addr, rax),
4776   // which stores rax into addr, the reverse of what was intended.
4777   if (addr.uses(rsp)) {
4778     lea(rax, addr);
4779     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4780   } else {
4781     pushptr(addr);
4782   }
4783 
4784   // Pass a message string to the verify_oop subroutine
4785   const char* b = nullptr;
4786   {
4787     ResourceMark rm;
4788     stringStream ss;
4789     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
4790     b = code_string(ss.as_string());
4791   }
4792   ExternalAddress buffer((address) b);
4793   pushptr(buffer.addr(), rscratch1);
4794 
4795   // call indirectly to solve generation ordering problem
4796   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4797   call(rax);
4798   // Caller pops the arguments (addr, message) and restores rax, r10.
4799 }
4800 
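     // Debug-only sanity check of the current thread's TLAB: start <= top <= end.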
4801 void MacroAssembler::verify_tlab() {
4802 #ifdef ASSERT
4803   if (UseTLAB && VerifyOops) {
4804     Label next, ok;
4805     Register t1 = rsi;
4806     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4807 
4808     push(t1);
4809     NOT_LP64(push(thread_reg));
4810     NOT_LP64(get_thread(thread_reg));
4811 
4812     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4813     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4814     jcc(Assembler::aboveEqual, next);
4815     STOP("assert(top >= start)");
4816     should_not_reach_here();
4817 
4818     bind(next);
4819     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4820     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4821     jcc(Assembler::aboveEqual, ok);
4822     STOP("assert(top <= end)");
4823     should_not_reach_here();
4824 
4825     bind(ok);
4826     NOT_LP64(pop(thread_reg));
4827     pop(t1);
4828   }
4829 #endif
4830 }
4831 
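     // Support classes for print_CPU_state() (and verify_FPU() on 32-bit):
     // in-memory views of the x87 FPU and integer register state.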
4832 class ControlWord {
4833  public:
4834   int32_t _value;
4835 
4836   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4837   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4838   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4839   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4840   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4841   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4842   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4843   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4844 
4845   void print() const {
4846     // rounding control
4847     const char* rc;
4848     switch (rounding_control()) {
4849       case 0: rc = "round near"; break;
4850       case 1: rc = "round down"; break;
4851       case 2: rc = "round up  "; break;
4852       case 3: rc = "chop      "; break;
4853       default:
4854         rc = nullptr; // silence compiler warnings
4855         fatal("Unknown rounding control: %d", rounding_control());
4856     };
4857     // precision control
4858     const char* pc;
4859     switch (precision_control()) {
4860       case 0: pc = "24 bits "; break;
4861       case 1: pc = "reserved"; break;
4862       case 2: pc = "53 bits "; break;
4863       case 3: pc = "64 bits "; break;
4864       default:
4865         pc = nullptr; // silence compiler warnings
4866         fatal("Unknown precision control: %d", precision_control());
4867     };
4868     // flags
4869     char f[9];
4870     f[0] = ' ';
4871     f[1] = ' ';
4872     f[2] = (precision   ()) ? 'P' : 'p';
4873     f[3] = (underflow   ()) ? 'U' : 'u';
4874     f[4] = (overflow    ()) ? 'O' : 'o';
4875     f[5] = (zero_divide ()) ? 'Z' : 'z';
4876     f[6] = (denormalized()) ? 'D' : 'd';
4877     f[7] = (invalid     ()) ? 'I' : 'i';
4878     f[8] = '\x0';
4879     // output
4880     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4881   }
4882 
4883 };
4884 
4885 class StatusWord {
4886  public:
4887   int32_t _value;
4888 
4889   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4890   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4891   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4892   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4893   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4894   int  top() const                     { return  (_value >> 11) & 7      ; }
4895   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4896   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4897   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4898   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4899   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4900   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4901   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4902   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4903 
4904   void print() const {
4905     // condition codes
4906     char c[5];
4907     c[0] = (C3()) ? '3' : '-';
4908     c[1] = (C2()) ? '2' : '-';
4909     c[2] = (C1()) ? '1' : '-';
4910     c[3] = (C0()) ? '0' : '-';
4911     c[4] = '\x0';
4912     // flags
4913     char f[9];
4914     f[0] = (error_status()) ? 'E' : '-';
4915     f[1] = (stack_fault ()) ? 'S' : '-';
4916     f[2] = (precision   ()) ? 'P' : '-';
4917     f[3] = (underflow   ()) ? 'U' : '-';
4918     f[4] = (overflow    ()) ? 'O' : '-';
4919     f[5] = (zero_divide ()) ? 'Z' : '-';
4920     f[6] = (denormalized()) ? 'D' : '-';
4921     f[7] = (invalid     ()) ? 'I' : '-';
4922     f[8] = '\x0';
4923     // output
4924     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4925   }
4926 
4927 };
4928 
4929 class TagWord {
4930  public:
4931   int32_t _value;
4932 
4933   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4934 
4935   void print() const {
4936     printf("%04x", _value & 0xFFFF);
4937   }
4938 
4939 };
4940 
4941 class FPU_Register {
4942  public:
4943   int32_t _m0;
4944   int32_t _m1;
4945   int16_t _ex;
4946 
4947   bool is_indefinite() const           {
4948     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4949   }
4950 
4951   void print() const {
4952     char  sign = (_ex < 0) ? '-' : '+';
4953     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4954     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
4955   };
4956 
4957 };
4958 
4959 class FPU_State {
4960  public:
4961   enum {
4962     register_size       = 10,
4963     number_of_registers =  8,
4964     register_mask       =  7
4965   };
4966 
4967   ControlWord  _control_word;
4968   StatusWord   _status_word;
4969   TagWord      _tag_word;
4970   int32_t      _error_offset;
4971   int32_t      _error_selector;
4972   int32_t      _data_offset;
4973   int32_t      _data_selector;
4974   int8_t       _register[register_size * number_of_registers];
4975 
4976   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4977   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
4978 
4979   const char* tag_as_string(int tag) const {
4980     switch (tag) {
4981       case 0: return "valid";
4982       case 1: return "zero";
4983       case 2: return "special";
4984       case 3: return "empty";
4985     }
4986     ShouldNotReachHere();
4987     return nullptr;
4988   }
4989 
4990   void print() const {
4991     // print computation registers
4992     { int t = _status_word.top();
4993       for (int i = 0; i < number_of_registers; i++) {
4994         int j = (i - t) & register_mask;
4995         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4996         st(j)->print();
4997         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4998       }
4999     }
5000     printf("\n");
5001     // print control registers
5002     printf("ctrl = "); _control_word.print(); printf("\n");
5003     printf("stat = "); _status_word .print(); printf("\n");
5004     printf("tags = "); _tag_word    .print(); printf("\n");
5005   }
5006 
5007 };
5008 
5009 class Flag_Register {
5010  public:
5011   int32_t _value;
5012 
5013   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
5014   bool direction() const               { return ((_value >> 10) & 1) != 0; }
5015   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
5016   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
5017   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
5018   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
5019   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
5020 
5021   void print() const {
5022     // flags
5023     char f[8];
5024     f[0] = (overflow       ()) ? 'O' : '-';
5025     f[1] = (direction      ()) ? 'D' : '-';
5026     f[2] = (sign           ()) ? 'S' : '-';
5027     f[3] = (zero           ()) ? 'Z' : '-';
5028     f[4] = (auxiliary_carry()) ? 'A' : '-';
5029     f[5] = (parity         ()) ? 'P' : '-';
5030     f[6] = (carry          ()) ? 'C' : '-';
5031     f[7] = '\x0';
5032     // output
5033     printf("%08x  flags = %s", _value, f);
5034   }
5035 
5036 };
5037 
5038 class IU_Register {
5039  public:
5040   int32_t _value;
5041 
5042   void print() const {
5043     printf("%08x  %11d", _value, _value);
5044   }
5045 
5046 };
5047 
5048 class IU_State {
5049  public:
5050   Flag_Register _eflags;
5051   IU_Register   _rdi;
5052   IU_Register   _rsi;
5053   IU_Register   _rbp;
5054   IU_Register   _rsp;
5055   IU_Register   _rbx;
5056   IU_Register   _rdx;
5057   IU_Register   _rcx;
5058   IU_Register   _rax;
5059 
5060   void print() const {
5061     // computation registers
5062     printf("rax,  = "); _rax.print(); printf("\n");
5063     printf("rbx,  = "); _rbx.print(); printf("\n");
5064     printf("rcx  = "); _rcx.print(); printf("\n");
5065     printf("rdx  = "); _rdx.print(); printf("\n");
5066     printf("rdi  = "); _rdi.print(); printf("\n");
5067     printf("rsi  = "); _rsi.print(); printf("\n");
5068     printf("rbp,  = "); _rbp.print(); printf("\n");
5069     printf("rsp  = "); _rsp.print(); printf("\n");
5070     printf("\n");
5071     // control registers
5072     printf("flgs = "); _eflags.print(); printf("\n");
5073   }
5074 };
5075 
5076 
5077 class CPU_State {
5078  public:
5079   FPU_State _fpu_state;
5080   IU_State  _iu_state;
5081 
5082   void print() const {
5083     printf("--------------------------------------------------\n");
5084     _iu_state .print();
5085     printf("\n");
5086     _fpu_state.print();
5087     printf("--------------------------------------------------\n");
5088   }
5089 
5090 };
5091 
5092 
5093 static void _print_CPU_state(CPU_State* state) {
5094   state->print();
5095 };
5096 
5097 
5098 void MacroAssembler::print_CPU_state() {
5099   push_CPU_state();
5100   push(rsp);                // pass CPU state
5101   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5102   addptr(rsp, wordSize);       // discard argument
5103   pop_CPU_state();
5104 }
5105 
5106 
5107 #ifndef _LP64
5108 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
5109   static int counter = 0;
5110   FPU_State* fs = &state->_fpu_state;
5111   counter++;
5112   // For leaf calls, only verify that the top few elements remain empty.
5113   // We only need 1 empty at the top for C2 code.
5114   if (stack_depth < 0) {
5115     if (fs->tag_for_st(7) != 3) {
5116       printf("FPR7 not empty\n");
5117       state->print();
5118       assert(false, "error");
5119       return false;
5120     }
5121     return true;                // All other stack states do not matter
5122   }
5123 
5124   assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(),
5125          "bad FPU control word");
5126 
5127   // compute stack depth
5128   int i = 0;
5129   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
5130   int d = i;
5131   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
5132   // verify findings
5133   if (i != FPU_State::number_of_registers) {
5134     // stack not contiguous
5135     printf("%s: stack not contiguous at ST%d\n", s, i);
5136     state->print();
5137     assert(false, "error");
5138     return false;
5139   }
5140   // check if computed stack depth corresponds to expected stack depth
5141   if (stack_depth < 0) {
5142     // expected stack depth is -stack_depth or less
5143     if (d > -stack_depth) {
5144       // too many elements on the stack
5145       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
5146       state->print();
5147       assert(false, "error");
5148       return false;
5149     }
5150   } else {
5151     // expected stack depth is stack_depth
5152     if (d != stack_depth) {
5153       // wrong stack depth
5154       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
5155       state->print();
5156       assert(false, "error");
5157       return false;
5158     }
5159   }
5160   // everything is cool
5161   return true;
5162 }
5163 
5164 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
5165   if (!VerifyFPU) return;
5166   push_CPU_state();
5167   push(rsp);                // pass CPU state
5168   ExternalAddress msg((address) s);
5169   // pass message string s
5170   pushptr(msg.addr(), noreg);
5171   push(stack_depth);        // pass stack depth
5172   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
5173   addptr(rsp, 3 * wordSize);   // discard arguments
5174   // check for error
5175   { Label L;
5176     testl(rax, rax);
5177     jcc(Assembler::notZero, L);
5178     int3();                  // break if error condition
5179     bind(L);
5180   }
5181   pop_CPU_state();
5182 }
5183 #endif // _LP64
5184 
5185 void MacroAssembler::restore_cpu_control_state_after_jni(Register rscratch) {
5186   // Either restore the MXCSR register after returning from the JNI Call
5187   // or verify that it wasn't changed (with -Xcheck:jni flag).
5188   if (VM_Version::supports_sse()) {
5189     if (RestoreMXCSROnJNICalls) {
5190       ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), rscratch);
5191     } else if (CheckJNICalls) {
5192       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5193     }
5194   }
5195   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5196   vzeroupper();
5197 
5198 #ifndef _LP64
5199   // Either restore the x87 floating pointer control word after returning
5200   // from the JNI call or verify that it wasn't changed.
5201   if (CheckJNICalls) {
5202     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
5203   }
5204 #endif // _LP64
5205 }
5206 
5207 // ((OopHandle)result).resolve();
5208 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5209   assert_different_registers(result, tmp);
5210 
5211   // Only 64 bit platforms support GCs that require a tmp register
5212   // Only IN_HEAP loads require a thread_tmp register
5213   // OopHandle::resolve is an indirection like jobject.
5214   access_load_at(T_OBJECT, IN_NATIVE,
5215                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
5216 }
5217 
5218 // ((WeakHandle)result).resolve();
5219 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
5220   assert_different_registers(rresult, rtmp);
5221   Label resolved;
5222 
5223   // A null weak handle resolves to null.
5224   cmpptr(rresult, 0);
5225   jcc(Assembler::equal, resolved);
5226 
5227   // Only 64 bit platforms support GCs that require a tmp register
5228   // Only IN_HEAP loads require a thread_tmp register
5229   // WeakHandle::resolve is an indirection like jweak.
5230   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
5231                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
5232   bind(resolved);
5233 }
5234 
5235 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
5236   // get mirror
5237   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
5238   load_method_holder(mirror, method);
5239   movptr(mirror, Address(mirror, mirror_offset));
5240   resolve_oop_handle(mirror, tmp);
5241 }
5242 
5243 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
5244   load_method_holder(rresult, rmethod);
5245   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
5246 }
5247 
5248 void MacroAssembler::load_method_holder(Register holder, Register method) {
5249   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
5250   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
5251   movptr(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
5252 }
5253 
5254 #ifdef _LP64
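     // With compact object headers the narrow klass is stored in the upper bits
     // of the mark word; if the object is monitor-locked, the displaced header
     // is fetched from the ObjectMonitor first.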
5255 void MacroAssembler::load_nklass_compact(Register dst, Register src) {
5256   assert(UseCompactObjectHeaders, "expect compact object headers");
5257 
5258   Label fast;
5259   movq(dst, Address(src, oopDesc::mark_offset_in_bytes()));
5260   testb(dst, markWord::monitor_value);
5261   jccb(Assembler::zero, fast);
5262 
5263   // Fetch displaced header
5264   movq(dst, Address(dst, OM_OFFSET_NO_MONITOR_VALUE_TAG(header)));
5265 
5266   bind(fast);
5267   shrq(dst, markWord::klass_shift);
5268 }
5269 #endif
5270 
5271 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
5272   assert_different_registers(src, tmp);
5273   assert_different_registers(dst, tmp);
5274 #ifdef _LP64
5275   if (UseCompactObjectHeaders) {
5276     load_nklass_compact(dst, src);
5277     decode_klass_not_null(dst, tmp);
5278   } else if (UseCompressedClassPointers) {
5279     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5280     decode_klass_not_null(dst, tmp);
5281   } else
5282 #endif
5283   {
5284     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5285   }
5286 }
5287 
5288 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
5289   assert(!UseCompactObjectHeaders, "not with compact headers");
5290   assert_different_registers(src, tmp);
5291   assert_different_registers(dst, tmp);
5292 #ifdef _LP64
5293   if (UseCompressedClassPointers) {
5294     encode_klass_not_null(src, tmp);
5295     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5296   } else
5297 #endif
5298     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5299 }
5300 
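     // Compare 'klass' against the klass of 'obj', using a narrow-klass compare
     // when compressed class pointers or compact headers are in use.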
5301 void MacroAssembler::cmp_klass(Register klass, Register obj, Register tmp) {
5302 #ifdef _LP64
5303   if (UseCompactObjectHeaders) {
5304     load_nklass_compact(tmp, obj);
5305     cmpl(klass, tmp);
5306   } else if (UseCompressedClassPointers) {
5307     cmpl(klass, Address(obj, oopDesc::klass_offset_in_bytes()));
5308   } else
5309 #endif
5310   {
5311     cmpptr(klass, Address(obj, oopDesc::klass_offset_in_bytes()));
5312   }
5313 }
5314 
5315 void MacroAssembler::cmp_klass(Register src, Register dst, Register tmp1, Register tmp2) {
5316 #ifdef _LP64
5317   if (UseCompactObjectHeaders) {
5318     assert(tmp2 != noreg, "need tmp2");
5319     assert_different_registers(src, dst, tmp1, tmp2);
5320     load_nklass_compact(tmp1, src);
5321     load_nklass_compact(tmp2, dst);
5322     cmpl(tmp1, tmp2);
5323   } else if (UseCompressedClassPointers) {
5324     movl(tmp1, Address(src, oopDesc::klass_offset_in_bytes()));
5325     cmpl(tmp1, Address(dst, oopDesc::klass_offset_in_bytes()));
5326   } else
5327 #endif
5328   {
5329     movptr(tmp1, Address(src, oopDesc::klass_offset_in_bytes()));
5330     cmpptr(tmp1, Address(dst, oopDesc::klass_offset_in_bytes()));
5331   }
5332 }
5333 
5334 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5335                                     Register tmp1, Register thread_tmp) {
5336   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5337   decorators = AccessInternal::decorator_fixup(decorators, type);
5338   bool as_raw = (decorators & AS_RAW) != 0;
5339   if (as_raw) {
5340     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5341   } else {
5342     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5343   }
5344 }
5345 
5346 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
5347                                      Register tmp1, Register tmp2, Register tmp3) {
5348   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5349   decorators = AccessInternal::decorator_fixup(decorators, type);
5350   bool as_raw = (decorators & AS_RAW) != 0;
5351   if (as_raw) {
5352     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
5353   } else {
5354     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
5355   }
5356 }
5357 
5358 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
5359                                    Register thread_tmp, DecoratorSet decorators) {
5360   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
5361 }
5362 
5363 // Doesn't do verification, generates fixed size code
5364 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
5365                                             Register thread_tmp, DecoratorSet decorators) {
5366   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
5367 }
5368 
5369 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
5370                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
5371   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
5372 }
5373 
5374 // Used for storing nulls.
5375 void MacroAssembler::store_heap_oop_null(Address dst) {
5376   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
5377 }
5378 
5379 #ifdef _LP64
5380 void MacroAssembler::store_klass_gap(Register dst, Register src) {
5381   assert(!UseCompactObjectHeaders, "Don't use with compact headers");
5382   if (UseCompressedClassPointers) {
5383     // Store to klass gap in destination
5384     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5385   }
5386 }
5387 
5388 #ifdef ASSERT
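     // Check that r12_heapbase still holds the compressed-oops base.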
5389 void MacroAssembler::verify_heapbase(const char* msg) {
5390   assert (UseCompressedOops, "should be compressed");
5391   assert (Universe::heap() != nullptr, "java heap should be initialized");
5392   if (CheckCompressedOops) {
5393     Label ok;
5394     ExternalAddress src2(CompressedOops::ptrs_base_addr());
5395     const bool is_src2_reachable = reachable(src2);
5396     if (!is_src2_reachable) {
5397       push(rscratch1);  // cmpptr trashes rscratch1
5398     }
5399     cmpptr(r12_heapbase, src2, rscratch1);
5400     jcc(Assembler::equal, ok);
5401     STOP(msg);
5402     bind(ok);
5403     if (!is_src2_reachable) {
5404       pop(rscratch1);
5405     }
5406   }
5407 }
5408 #endif
5409 
5410 // Algorithm must match oop.inline.hpp encode_heap_oop.
5411 void MacroAssembler::encode_heap_oop(Register r) {
5412 #ifdef ASSERT
5413   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5414 #endif
5415   verify_oop_msg(r, "broken oop in encode_heap_oop");
5416   if (CompressedOops::base() == nullptr) {
5417     if (CompressedOops::shift() != 0) {
5418       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5419       shrq(r, LogMinObjAlignmentInBytes);
5420     }
5421     return;
5422   }
5423   testq(r, r);
5424   cmovq(Assembler::equal, r, r12_heapbase);
5425   subq(r, r12_heapbase);
5426   shrq(r, LogMinObjAlignmentInBytes);
5427 }
5428 
5429 void MacroAssembler::encode_heap_oop_not_null(Register r) {
5430 #ifdef ASSERT
5431   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5432   if (CheckCompressedOops) {
5433     Label ok;
5434     testq(r, r);
5435     jcc(Assembler::notEqual, ok);
5436     STOP("null oop passed to encode_heap_oop_not_null");
5437     bind(ok);
5438   }
5439 #endif
5440   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
5441   if (CompressedOops::base() != nullptr) {
5442     subq(r, r12_heapbase);
5443   }
5444   if (CompressedOops::shift() != 0) {
5445     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5446     shrq(r, LogMinObjAlignmentInBytes);
5447   }
5448 }
5449 
5450 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5451 #ifdef ASSERT
5452   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5453   if (CheckCompressedOops) {
5454     Label ok;
5455     testq(src, src);
5456     jcc(Assembler::notEqual, ok);
5457     STOP("null oop passed to encode_heap_oop_not_null2");
5458     bind(ok);
5459   }
5460 #endif
5461   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
5462   if (dst != src) {
5463     movq(dst, src);
5464   }
5465   if (CompressedOops::base() != nullptr) {
5466     subq(dst, r12_heapbase);
5467   }
5468   if (CompressedOops::shift() != 0) {
5469     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5470     shrq(dst, LogMinObjAlignmentInBytes);
5471   }
5472 }
5473 
5474 void  MacroAssembler::decode_heap_oop(Register r) {
5475 #ifdef ASSERT
5476   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5477 #endif
5478   if (CompressedOops::base() == nullptr) {
5479     if (CompressedOops::shift() != 0) {
5480       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5481       shlq(r, LogMinObjAlignmentInBytes);
5482     }
5483   } else {
5484     Label done;
5485     shlq(r, LogMinObjAlignmentInBytes);
5486     jccb(Assembler::equal, done);
5487     addq(r, r12_heapbase);
5488     bind(done);
5489   }
5490   verify_oop_msg(r, "broken oop in decode_heap_oop");
5491 }
5492 
5493 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
5494   // Note: it will change flags
5495   assert (UseCompressedOops, "should only be used for compressed headers");
5496   assert (Universe::heap() != nullptr, "java heap should be initialized");
5497   // Cannot assert, unverified entry point counts instructions (see .ad file)
5498   // vtableStubs also counts instructions in pd_code_size_limit.
5499   // Also do not verify_oop as this is called by verify_oop.
5500   if (CompressedOops::shift() != 0) {
5501     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5502     shlq(r, LogMinObjAlignmentInBytes);
5503     if (CompressedOops::base() != nullptr) {
5504       addq(r, r12_heapbase);
5505     }
5506   } else {
5507     assert (CompressedOops::base() == nullptr, "sanity");
5508   }
5509 }
5510 
5511 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5512   // Note: it will change flags
5513   assert (UseCompressedOops, "should only be used for compressed headers");
5514   assert (Universe::heap() != nullptr, "java heap should be initialized");
5515   // Cannot assert, unverified entry point counts instructions (see .ad file)
5516   // vtableStubs also counts instructions in pd_code_size_limit.
5517   // Also do not verify_oop as this is called by verify_oop.
5518   if (CompressedOops::shift() != 0) {
5519     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5520     if (LogMinObjAlignmentInBytes == Address::times_8) {
5521       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5522     } else {
5523       if (dst != src) {
5524         movq(dst, src);
5525       }
5526       shlq(dst, LogMinObjAlignmentInBytes);
5527       if (CompressedOops::base() != nullptr) {
5528         addq(dst, r12_heapbase);
5529       }
5530     }
5531   } else {
5532     assert (CompressedOops::base() == nullptr, "sanity");
5533     if (dst != src) {
5534       movq(dst, src);
5535     }
5536   }
5537 }
5538 
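     // Klass pointer compression/decompression helpers: encoding subtracts the
     // base (if any) and shifts right by the klass alignment; decoding reverses
     // the transformation.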
5539 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
5540   assert_different_registers(r, tmp);
5541   if (CompressedKlassPointers::base() != nullptr) {
5542     mov64(tmp, (int64_t)CompressedKlassPointers::base());
5543     subq(r, tmp);
5544   }
5545   if (CompressedKlassPointers::shift() != 0) {
5546     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5547     shrq(r, LogKlassAlignmentInBytes);
5548   }
5549 }
5550 
5551 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
5552   assert_different_registers(src, dst);
5553   if (CompressedKlassPointers::base() != nullptr) {
5554     mov64(dst, -(int64_t)CompressedKlassPointers::base());
5555     addq(dst, src);
5556   } else {
5557     movptr(dst, src);
5558   }
5559   if (CompressedKlassPointers::shift() != 0) {
5560     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5561     shrq(dst, LogKlassAlignmentInBytes);
5562   }
5563 }
5564 
5565 void  MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
5566   assert_different_registers(r, tmp);
5567   // Note: it will change flags
5568   assert(UseCompressedClassPointers, "should only be used for compressed headers");
5569   // Cannot assert, unverified entry point counts instructions (see .ad file)
5570   // vtableStubs also counts instructions in pd_code_size_limit.
5571   // Also do not verify_oop as this is called by verify_oop.
5572   if (CompressedKlassPointers::shift() != 0) {
5573     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5574     shlq(r, LogKlassAlignmentInBytes);
5575   }
5576   if (CompressedKlassPointers::base() != nullptr) {
5577     mov64(tmp, (int64_t)CompressedKlassPointers::base());
5578     addq(r, tmp);
5579   }
5580 }
5581 
5582 void  MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
5583   assert_different_registers(src, dst);
5584   // Note: it will change flags
5585   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5586   // Cannot assert, unverified entry point counts instructions (see .ad file)
5587   // vtableStubs also counts instructions in pd_code_size_limit.
5588   // Also do not verify_oop as this is called by verify_oop.
5589 
5590   if (CompressedKlassPointers::base() == nullptr &&
5591       CompressedKlassPointers::shift() == 0) {
5592     // The best case scenario is that there is no base or shift. Then it is already
5593     // a pointer that needs nothing but a register rename.
5594     movl(dst, src);
5595   } else {
5596     if (CompressedKlassPointers::base() != nullptr) {
5597       mov64(dst, (int64_t)CompressedKlassPointers::base());
5598     } else {
5599       xorq(dst, dst);
5600     }
5601     if (CompressedKlassPointers::shift() != 0) {
5602       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5603       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5604       leaq(dst, Address(dst, src, Address::times_8, 0));
5605     } else {
5606       addq(dst, src);
5607     }
5608   }
5609 }
5610 
5611 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5612   assert (UseCompressedOops, "should only be used for compressed headers");
5613   assert (Universe::heap() != nullptr, "java heap should be initialized");
5614   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5615   int oop_index = oop_recorder()->find_index(obj);
5616   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5617   mov_narrow_oop(dst, oop_index, rspec);
5618 }
5619 
5620 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5621   assert (UseCompressedOops, "should only be used for compressed headers");
5622   assert (Universe::heap() != nullptr, "java heap should be initialized");
5623   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5624   int oop_index = oop_recorder()->find_index(obj);
5625   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5626   mov_narrow_oop(dst, oop_index, rspec);
5627 }
5628 
5629 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5630   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5631   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5632   int klass_index = oop_recorder()->find_index(k);
5633   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5634   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5635 }
5636 
5637 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5638   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5639   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5640   int klass_index = oop_recorder()->find_index(k);
5641   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5642   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5643 }
5644 
5645 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5646   assert (UseCompressedOops, "should only be used for compressed headers");
5647   assert (Universe::heap() != nullptr, "java heap should be initialized");
5648   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5649   int oop_index = oop_recorder()->find_index(obj);
5650   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5651   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5652 }
5653 
5654 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5655   assert (UseCompressedOops, "should only be used for compressed headers");
5656   assert (Universe::heap() != nullptr, "java heap should be initialized");
5657   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5658   int oop_index = oop_recorder()->find_index(obj);
5659   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5660   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5661 }
5662 
5663 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5664   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5665   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5666   int klass_index = oop_recorder()->find_index(k);
5667   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5668   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5669 }
5670 
5671 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5672   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5673   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5674   int klass_index = oop_recorder()->find_index(k);
5675   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5676   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5677 }
5678 
5679 void MacroAssembler::reinit_heapbase() {
5680   if (UseCompressedOops) {
5681     if (Universe::heap() != nullptr) {
5682       if (CompressedOops::base() == nullptr) {
5683         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5684       } else {
5685         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5686       }
5687     } else {
5688       movptr(r12_heapbase, ExternalAddress(CompressedOops::ptrs_base_addr()));
5689     }
5690   }
5691 }
5692 
5693 #endif // _LP64
5694 
5695 #if COMPILER2_OR_JVMCI
5696 
5697 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
5698 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5699   // cnt - number of qwords (8-byte words).
5700   // base - start address, qword aligned.
5701   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5702   bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0);
5703   if (use64byteVector) {
5704     vpxor(xtmp, xtmp, xtmp, AVX_512bit);
5705   } else if (MaxVectorSize >= 32) {
5706     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5707   } else {
5708     pxor(xtmp, xtmp);
5709   }
5710   jmp(L_zero_64_bytes);
5711 
5712   BIND(L_loop);
5713   if (MaxVectorSize >= 32) {
5714     fill64(base, 0, xtmp, use64byteVector);
5715   } else {
5716     movdqu(Address(base,  0), xtmp);
5717     movdqu(Address(base, 16), xtmp);
5718     movdqu(Address(base, 32), xtmp);
5719     movdqu(Address(base, 48), xtmp);
5720   }
5721   addptr(base, 64);
5722 
5723   BIND(L_zero_64_bytes);
5724   subptr(cnt, 8);
5725   jccb(Assembler::greaterEqual, L_loop);
5726 
5727   // Copy trailing 64 bytes
5728   if (use64byteVector) {
5729     addptr(cnt, 8);
5730     jccb(Assembler::equal, L_end);
5731     fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
5732     jmp(L_end);
5733   } else {
5734     addptr(cnt, 4);
5735     jccb(Assembler::less, L_tail);
5736     if (MaxVectorSize >= 32) {
5737       vmovdqu(Address(base, 0), xtmp);
5738     } else {
5739       movdqu(Address(base,  0), xtmp);
5740       movdqu(Address(base, 16), xtmp);
5741     }
5742   }
5743   addptr(base, 32);
5744   subptr(cnt, 4);
5745 
5746   BIND(L_tail);
5747   addptr(cnt, 4);
5748   jccb(Assembler::lessEqual, L_end);
5749   if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
5750     fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
5751   } else {
5752     decrement(cnt);
5753 
5754     BIND(L_sloop);
5755     movq(Address(base, 0), xtmp);
5756     addptr(base, 8);
5757     decrement(cnt);
5758     jccb(Assembler::greaterEqual, L_sloop);
5759   }
5760   BIND(L_end);
5761 }
5762 
5763 // Clearing constant sized memory using YMM/ZMM registers.
5764 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5765   assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), "");
5766   bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
5767 
5768   int vector64_count = (cnt & (~0x7)) >> 3;
5769   cnt = cnt & 0x7;
5770   const int fill64_per_loop = 4;
5771   const int max_unrolled_fill64 = 8;
5772 
5773   // 64 byte initialization loop.
5774   vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
5775   int start64 = 0;
5776   if (vector64_count > max_unrolled_fill64) {
5777     Label LOOP;
5778     Register index = rtmp;
5779 
5780     start64 = vector64_count - (vector64_count % fill64_per_loop);
5781 
5782     movl(index, 0);
5783     BIND(LOOP);
5784     for (int i = 0; i < fill64_per_loop; i++) {
5785       fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
5786     }
5787     addl(index, fill64_per_loop * 64);
5788     cmpl(index, start64 * 64);
5789     jccb(Assembler::less, LOOP);
5790   }
5791   for (int i = start64; i < vector64_count; i++) {
5792     fill64(base, i * 64, xtmp, use64byteVector);
5793   }
5794 
5795   // Clear remaining 64 byte tail.
5796   int disp = vector64_count * 64;
5797   if (cnt) {
5798     switch (cnt) {
5799       case 1:
5800         movq(Address(base, disp), xtmp);
5801         break;
5802       case 2:
5803         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit);
5804         break;
5805       case 3:
5806         movl(rtmp, 0x7);
5807         kmovwl(mask, rtmp);
5808         evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit);
5809         break;
5810       case 4:
5811         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5812         break;
5813       case 5:
5814         if (use64byteVector) {
5815           movl(rtmp, 0x1F);
5816           kmovwl(mask, rtmp);
5817           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5818         } else {
5819           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5820           movq(Address(base, disp + 32), xtmp);
5821         }
5822         break;
5823       case 6:
5824         if (use64byteVector) {
5825           movl(rtmp, 0x3F);
5826           kmovwl(mask, rtmp);
5827           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5828         } else {
5829           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5830           evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit);
5831         }
5832         break;
5833       case 7:
5834         if (use64byteVector) {
5835           movl(rtmp, 0x7F);
5836           kmovwl(mask, rtmp);
5837           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5838         } else {
5839           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5840           movl(rtmp, 0x7);
5841           kmovwl(mask, rtmp);
5842           evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit);
5843         }
5844         break;
5845       default:
5846         fatal("Unexpected length : %d\n",cnt);
5847         break;
5848     }
5849   }
5850 }
5851 
5852 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
5853                                bool is_large, KRegister mask) {
5854   // cnt      - number of qwords (8-byte words).
5855   // base     - start address, qword aligned.
5856   // is_large - if optimizers know cnt is larger than InitArrayShortSize
5857   assert(base==rdi, "base register must be edi for rep stos");
5858   assert(tmp==rax,   "tmp register must be eax for rep stos");
5859   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
5860   assert(InitArrayShortSize % BytesPerLong == 0,
5861     "InitArrayShortSize should be the multiple of BytesPerLong");
5862 
5863   Label DONE;
5864   if (!is_large || !UseXMMForObjInit) {
5865     xorptr(tmp, tmp);
5866   }
5867 
5868   if (!is_large) {
5869     Label LOOP, LONG;
5870     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5871     jccb(Assembler::greater, LONG);
5872 
5873     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5874 
5875     decrement(cnt);
5876     jccb(Assembler::negative, DONE); // Zero length
5877 
5878     // Use individual pointer-sized stores for small counts:
5879     BIND(LOOP);
5880     movptr(Address(base, cnt, Address::times_ptr), tmp);
5881     decrement(cnt);
5882     jccb(Assembler::greaterEqual, LOOP);
5883     jmpb(DONE);
5884 
5885     BIND(LONG);
5886   }
5887 
5888   // Use longer rep-prefixed ops for non-small counts:
5889   if (UseFastStosb) {
5890     shlptr(cnt, 3); // convert to number of bytes
5891     rep_stosb();
5892   } else if (UseXMMForObjInit) {
5893     xmm_clear_mem(base, cnt, tmp, xtmp, mask);
5894   } else {
5895     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5896     rep_stos();
5897   }
5898 
5899   BIND(DONE);
5900 }
5901 
5902 #endif //COMPILER2_OR_JVMCI
5903 
5904 
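     // Fill 'count' elements of type 't' (byte, short or int) starting at 'to'
     // with 'value'; the value is replicated to 32 bits and SSE/AVX vector
     // stores are used where available.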
5905 void MacroAssembler::generate_fill(BasicType t, bool aligned,
5906                                    Register to, Register value, Register count,
5907                                    Register rtmp, XMMRegister xtmp) {
5908   ShortBranchVerifier sbv(this);
5909   assert_different_registers(to, value, count, rtmp);
5910   Label L_exit;
5911   Label L_fill_2_bytes, L_fill_4_bytes;
5912 
5913 #if defined(COMPILER2) && defined(_LP64)
5914   if (MaxVectorSize >= 32 &&
5915      VM_Version::supports_avx512vlbw() &&
5916      VM_Version::supports_bmi2()) {
5917     generate_fill_avx3(t, to, value, count, rtmp, xtmp);
5918     return;
5919   }
5920 #endif
5921 
5922   int shift = -1;
5923   switch (t) {
5924     case T_BYTE:
5925       shift = 2;
5926       break;
5927     case T_SHORT:
5928       shift = 1;
5929       break;
5930     case T_INT:
5931       shift = 0;
5932       break;
5933     default: ShouldNotReachHere();
5934   }
5935 
5936   if (t == T_BYTE) {
5937     andl(value, 0xff);
5938     movl(rtmp, value);
5939     shll(rtmp, 8);
5940     orl(value, rtmp);
5941   }
5942   if (t == T_SHORT) {
5943     andl(value, 0xffff);
5944   }
5945   if (t == T_BYTE || t == T_SHORT) {
5946     movl(rtmp, value);
5947     shll(rtmp, 16);
5948     orl(value, rtmp);
5949   }
5950 
5951   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
5952   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
5953   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
5954     Label L_skip_align2;
5955     // align source address at 4 bytes address boundary
5956     if (t == T_BYTE) {
5957       Label L_skip_align1;
5958       // One byte misalignment happens only for byte arrays
5959       testptr(to, 1);
5960       jccb(Assembler::zero, L_skip_align1);
5961       movb(Address(to, 0), value);
5962       increment(to);
5963       decrement(count);
5964       BIND(L_skip_align1);
5965     }
5966     // Two bytes misalignment happens only for byte and short (char) arrays
5967     testptr(to, 2);
5968     jccb(Assembler::zero, L_skip_align2);
5969     movw(Address(to, 0), value);
5970     addptr(to, 2);
5971     subl(count, 1<<(shift-1));
5972     BIND(L_skip_align2);
5973   }
5974   if (UseSSE < 2) {
5975     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5976     // Fill 32-byte chunks
5977     subl(count, 8 << shift);
5978     jcc(Assembler::less, L_check_fill_8_bytes);
5979     align(16);
5980 
5981     BIND(L_fill_32_bytes_loop);
5982 
5983     for (int i = 0; i < 32; i += 4) {
5984       movl(Address(to, i), value);
5985     }
5986 
5987     addptr(to, 32);
5988     subl(count, 8 << shift);
5989     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5990     BIND(L_check_fill_8_bytes);
5991     addl(count, 8 << shift);
5992     jccb(Assembler::zero, L_exit);
5993     jmpb(L_fill_8_bytes);
5994 
5995     //
5996     // length is too short, just fill qwords
5997     //
5998     BIND(L_fill_8_bytes_loop);
5999     movl(Address(to, 0), value);
6000     movl(Address(to, 4), value);
6001     addptr(to, 8);
6002     BIND(L_fill_8_bytes);
6003     subl(count, 1 << (shift + 1));
6004     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
6005     // fall through to fill 4 bytes
6006   } else {
6007     Label L_fill_32_bytes;
6008     if (!UseUnalignedLoadStores) {
6009       // align to 8 bytes, we know we are 4 byte aligned to start
6010       testptr(to, 4);
6011       jccb(Assembler::zero, L_fill_32_bytes);
6012       movl(Address(to, 0), value);
6013       addptr(to, 4);
6014       subl(count, 1<<shift);
6015     }
6016     BIND(L_fill_32_bytes);
6017     {
6018       assert( UseSSE >= 2, "supported cpu only" );
6019       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
6020       movdl(xtmp, value);
6021       if (UseAVX >= 2 && UseUnalignedLoadStores) {
6022         Label L_check_fill_32_bytes;
6023         if (UseAVX > 2) {
6024           // Fill 64-byte chunks
6025           Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
6026 
6027           // If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2
6028           cmpl(count, VM_Version::avx3_threshold());
6029           jccb(Assembler::below, L_check_fill_64_bytes_avx2);
6030 
6031           vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
6032 
6033           subl(count, 16 << shift);
6034           jccb(Assembler::less, L_check_fill_32_bytes);
6035           align(16);
6036 
6037           BIND(L_fill_64_bytes_loop_avx3);
6038           evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
6039           addptr(to, 64);
6040           subl(count, 16 << shift);
6041           jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
6042           jmpb(L_check_fill_32_bytes);
6043 
6044           BIND(L_check_fill_64_bytes_avx2);
6045         }
6046         // Fill 64-byte chunks
6047         Label L_fill_64_bytes_loop;
6048         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
6049 
6050         subl(count, 16 << shift);
6051         jcc(Assembler::less, L_check_fill_32_bytes);
6052         align(16);
6053 
6054         BIND(L_fill_64_bytes_loop);
6055         vmovdqu(Address(to, 0), xtmp);
6056         vmovdqu(Address(to, 32), xtmp);
6057         addptr(to, 64);
6058         subl(count, 16 << shift);
6059         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
6060 
6061         BIND(L_check_fill_32_bytes);
6062         addl(count, 8 << shift);
6063         jccb(Assembler::less, L_check_fill_8_bytes);
6064         vmovdqu(Address(to, 0), xtmp);
6065         addptr(to, 32);
6066         subl(count, 8 << shift);
6067 
6068         BIND(L_check_fill_8_bytes);
6069         // clean upper bits of YMM registers
6070         movdl(xtmp, value);
6071         pshufd(xtmp, xtmp, 0);
6072       } else {
6073         // Fill 32-byte chunks
6074         pshufd(xtmp, xtmp, 0);
6075 
6076         subl(count, 8 << shift);
6077         jcc(Assembler::less, L_check_fill_8_bytes);
6078         align(16);
6079 
6080         BIND(L_fill_32_bytes_loop);
6081 
6082         if (UseUnalignedLoadStores) {
6083           movdqu(Address(to, 0), xtmp);
6084           movdqu(Address(to, 16), xtmp);
6085         } else {
6086           movq(Address(to, 0), xtmp);
6087           movq(Address(to, 8), xtmp);
6088           movq(Address(to, 16), xtmp);
6089           movq(Address(to, 24), xtmp);
6090         }
6091 
6092         addptr(to, 32);
6093         subl(count, 8 << shift);
6094         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
6095 
6096         BIND(L_check_fill_8_bytes);
6097       }
6098       addl(count, 8 << shift);
6099       jccb(Assembler::zero, L_exit);
6100       jmpb(L_fill_8_bytes);
6101 
6102       //
6103       // length is too short, just fill qwords
6104       //
6105       BIND(L_fill_8_bytes_loop);
6106       movq(Address(to, 0), xtmp);
6107       addptr(to, 8);
6108       BIND(L_fill_8_bytes);
6109       subl(count, 1 << (shift + 1));
6110       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
6111     }
6112   }
6113   // fill trailing 4 bytes
6114   BIND(L_fill_4_bytes);
6115   testl(count, 1<<shift);
6116   jccb(Assembler::zero, L_fill_2_bytes);
6117   movl(Address(to, 0), value);
6118   if (t == T_BYTE || t == T_SHORT) {
6119     Label L_fill_byte;
6120     addptr(to, 4);
6121     BIND(L_fill_2_bytes);
6122     // fill trailing 2 bytes
6123     testl(count, 1<<(shift-1));
6124     jccb(Assembler::zero, L_fill_byte);
6125     movw(Address(to, 0), value);
6126     if (t == T_BYTE) {
6127       addptr(to, 2);
6128       BIND(L_fill_byte);
6129       // fill trailing byte
6130       testl(count, 1);
6131       jccb(Assembler::zero, L_exit);
6132       movb(Address(to, 0), value);
6133     } else {
6134       BIND(L_fill_byte);
6135     }
6136   } else {
6137     BIND(L_fill_2_bytes);
6138   }
6139   BIND(L_exit);
6140 }
6141 
6142 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
6143   switch(type) {
6144     case T_BYTE:
6145     case T_BOOLEAN:
6146       evpbroadcastb(dst, src, vector_len);
6147       break;
6148     case T_SHORT:
6149     case T_CHAR:
6150       evpbroadcastw(dst, src, vector_len);
6151       break;
6152     case T_INT:
6153     case T_FLOAT:
6154       evpbroadcastd(dst, src, vector_len);
6155       break;
6156     case T_LONG:
6157     case T_DOUBLE:
6158       evpbroadcastq(dst, src, vector_len);
6159       break;
6160     default:
6161       fatal("Unhandled type : %s", type2name(type));
6162       break;
6163   }
6164 }
6165 
// encode char[] to byte[] in ISO_8859_1 or ASCII
//
//   @IntrinsicCandidate
//   private static int implEncodeISOArray(byte[] sa, int sp,
//                                         byte[] da, int dp, int len) {
//     int i = 0;
//     for (; i < len; i++) {
//       char c = StringUTF16.getChar(sa, sp++);
//       if (c > '\u00FF')
//         break;
//       da[dp++] = (byte)c;
//     }
//     return i;
//   }
//
//   @IntrinsicCandidate
//   private static int implEncodeAsciiArray(char[] sa, int sp,
//                                           byte[] da, int dp, int len) {
//     int i = 0;
//     for (; i < len; i++) {
//       char c = sa[sp++];
//       if (c >= '\u0080')
//         break;
//       da[dp++] = (byte)c;
//     }
//     return i;
//   }
6192 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
6193   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
6194   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
6195   Register tmp5, Register result, bool ascii) {
6196 
6197   // rsi: src
6198   // rdi: dst
6199   // rdx: len
6200   // rcx: tmp5
6201   // rax: result
6202   ShortBranchVerifier sbv(this);
6203   assert_different_registers(src, dst, len, tmp5, result);
6204   Label L_done, L_copy_1_char, L_copy_1_char_exit;
6205 
6206   int mask = ascii ? 0xff80ff80 : 0xff00ff00;
6207   int short_mask = ascii ? 0xff80 : 0xff00;
6208 
6209   // set result
6210   xorl(result, result);
6211   // check for zero length
6212   testl(len, len);
6213   jcc(Assembler::zero, L_done);
6214 
6215   movl(result, len);
6216 
6217   // Setup pointers
6218   lea(src, Address(src, len, Address::times_2)); // char[]
6219   lea(dst, Address(dst, len, Address::times_1)); // byte[]
6220   negptr(len);
6221 
6222   if (UseSSE42Intrinsics || UseAVX >= 2) {
6223     Label L_copy_8_chars, L_copy_8_chars_exit;
6224     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
6225 
6226     if (UseAVX >= 2) {
6227       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
6228       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
6229       movdl(tmp1Reg, tmp5);
6230       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
6231       jmp(L_chars_32_check);
6232 
6233       bind(L_copy_32_chars);
6234       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
6235       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
6236       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
6237       vptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
6238       jccb(Assembler::notZero, L_copy_32_chars_exit);
6239       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
6240       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
6241       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
6242 
6243       bind(L_chars_32_check);
6244       addptr(len, 32);
6245       jcc(Assembler::lessEqual, L_copy_32_chars);
6246 
6247       bind(L_copy_32_chars_exit);
6248       subptr(len, 16);
6249       jccb(Assembler::greater, L_copy_16_chars_exit);
6250 
6251     } else if (UseSSE42Intrinsics) {
6252       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
6253       movdl(tmp1Reg, tmp5);
6254       pshufd(tmp1Reg, tmp1Reg, 0);
6255       jmpb(L_chars_16_check);
6256     }
6257 
6258     bind(L_copy_16_chars);
6259     if (UseAVX >= 2) {
6260       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
6261       vptest(tmp2Reg, tmp1Reg);
6262       jcc(Assembler::notZero, L_copy_16_chars_exit);
6263       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
6264       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
6265     } else {
6266       if (UseAVX > 0) {
6267         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
6268         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
6269         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
6270       } else {
6271         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
6272         por(tmp2Reg, tmp3Reg);
6273         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
6274         por(tmp2Reg, tmp4Reg);
6275       }
6276       ptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
6277       jccb(Assembler::notZero, L_copy_16_chars_exit);
6278       packuswb(tmp3Reg, tmp4Reg);
6279     }
6280     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
6281 
6282     bind(L_chars_16_check);
6283     addptr(len, 16);
6284     jcc(Assembler::lessEqual, L_copy_16_chars);
6285 
6286     bind(L_copy_16_chars_exit);
6287     if (UseAVX >= 2) {
6288       // clean upper bits of YMM registers
6289       vpxor(tmp2Reg, tmp2Reg);
6290       vpxor(tmp3Reg, tmp3Reg);
6291       vpxor(tmp4Reg, tmp4Reg);
6292       movdl(tmp1Reg, tmp5);
6293       pshufd(tmp1Reg, tmp1Reg, 0);
6294     }
6295     subptr(len, 8);
6296     jccb(Assembler::greater, L_copy_8_chars_exit);
6297 
6298     bind(L_copy_8_chars);
6299     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
6300     ptest(tmp3Reg, tmp1Reg);
6301     jccb(Assembler::notZero, L_copy_8_chars_exit);
6302     packuswb(tmp3Reg, tmp1Reg);
6303     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
6304     addptr(len, 8);
6305     jccb(Assembler::lessEqual, L_copy_8_chars);
6306 
6307     bind(L_copy_8_chars_exit);
6308     subptr(len, 8);
6309     jccb(Assembler::zero, L_done);
6310   }
6311 
6312   bind(L_copy_1_char);
6313   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
6314   testl(tmp5, short_mask);      // check if Unicode or non-ASCII char
6315   jccb(Assembler::notZero, L_copy_1_char_exit);
6316   movb(Address(dst, len, Address::times_1, 0), tmp5);
6317   addptr(len, 1);
6318   jccb(Assembler::less, L_copy_1_char);
6319 
6320   bind(L_copy_1_char_exit);
6321   addptr(result, len); // len is negative count of not processed elements
6322 
6323   bind(L_done);
6324 }
6325 
6326 #ifdef _LP64
6327 /**
 * Helper for multiply_to_len(): computes the 128-bit sum
 *   dest_hi:dest_lo += src1 + src2
 * with the carries out of dest_lo propagated into dest_hi.
6329  */
6330 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
6331   addq(dest_lo, src1);
6332   adcq(dest_hi, 0);
6333   addq(dest_lo, src2);
6334   adcq(dest_hi, 0);
6335 }
6336 
6337 /**
6338  * Multiply 64 bit by 64 bit first loop.
6339  */
6340 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
6341                                            Register y, Register y_idx, Register z,
6342                                            Register carry, Register product,
6343                                            Register idx, Register kdx) {
6344   //
6345   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
6347   //    huge_128 product = y[idx] * x[xstart] + carry;
6348   //    z[kdx] = (jlong)product;
6349   //    carry  = (jlong)(product >>> 64);
6350   //  }
6351   //  z[xstart] = carry;
6352   //
6353 
6354   Label L_first_loop, L_first_loop_exit;
6355   Label L_one_x, L_one_y, L_multiply;
6356 
6357   decrementl(xstart);
6358   jcc(Assembler::negative, L_one_x);
6359 
6360   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
6361   rorq(x_xstart, 32); // convert big-endian to little-endian
6362 
6363   bind(L_first_loop);
6364   decrementl(idx);
6365   jcc(Assembler::negative, L_first_loop_exit);
6366   decrementl(idx);
6367   jcc(Assembler::negative, L_one_y);
6368   movq(y_idx, Address(y, idx, Address::times_4,  0));
6369   rorq(y_idx, 32); // convert big-endian to little-endian
6370   bind(L_multiply);
6371   movq(product, x_xstart);
6372   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
6373   addq(product, carry);
6374   adcq(rdx, 0);
6375   subl(kdx, 2);
6376   movl(Address(z, kdx, Address::times_4,  4), product);
6377   shrq(product, 32);
6378   movl(Address(z, kdx, Address::times_4,  0), product);
6379   movq(carry, rdx);
6380   jmp(L_first_loop);
6381 
6382   bind(L_one_y);
6383   movl(y_idx, Address(y,  0));
6384   jmp(L_multiply);
6385 
6386   bind(L_one_x);
6387   movl(x_xstart, Address(x,  0));
6388   jmp(L_first_loop);
6389 
6390   bind(L_first_loop_exit);
6391 }
6392 
6393 /**
6394  * Multiply 64 bit by 64 bit and add 128 bit.
6395  */
6396 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
6397                                             Register yz_idx, Register idx,
6398                                             Register carry, Register product, int offset) {
6399   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
6400   //     z[kdx] = (jlong)product;
6401 
6402   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
6403   rorq(yz_idx, 32); // convert big-endian to little-endian
6404   movq(product, x_xstart);
6405   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
6406   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
6407   rorq(yz_idx, 32); // convert big-endian to little-endian
6408 
6409   add2_with_carry(rdx, product, carry, yz_idx);
6410 
6411   movl(Address(z, idx, Address::times_4,  offset+4), product);
6412   shrq(product, 32);
6413   movl(Address(z, idx, Address::times_4,  offset), product);
6414 
6415 }
6416 
6417 /**
6418  * Multiply 128 bit by 128 bit. Unrolled inner loop.
6419  */
6420 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
6421                                              Register yz_idx, Register idx, Register jdx,
6422                                              Register carry, Register product,
6423                                              Register carry2) {
6424   //   jlong carry, x[], y[], z[];
6425   //   int kdx = ystart+1;
6426   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6427   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
6428   //     z[kdx+idx+1] = (jlong)product;
6429   //     jlong carry2  = (jlong)(product >>> 64);
6430   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
6431   //     z[kdx+idx] = (jlong)product;
6432   //     carry  = (jlong)(product >>> 64);
6433   //   }
6434   //   idx += 2;
6435   //   if (idx > 0) {
6436   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
6437   //     z[kdx+idx] = (jlong)product;
6438   //     carry  = (jlong)(product >>> 64);
6439   //   }
6440   //
6441 
6442   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6443 
6444   movl(jdx, idx);
6445   andl(jdx, 0xFFFFFFFC);
6446   shrl(jdx, 2);
6447 
6448   bind(L_third_loop);
6449   subl(jdx, 1);
6450   jcc(Assembler::negative, L_third_loop_exit);
6451   subl(idx, 4);
6452 
6453   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
6454   movq(carry2, rdx);
6455 
6456   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
6457   movq(carry, rdx);
6458   jmp(L_third_loop);
6459 
6460   bind (L_third_loop_exit);
6461 
6462   andl (idx, 0x3);
6463   jcc(Assembler::zero, L_post_third_loop_done);
6464 
6465   Label L_check_1;
6466   subl(idx, 2);
6467   jcc(Assembler::negative, L_check_1);
6468 
6469   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
6470   movq(carry, rdx);
6471 
6472   bind (L_check_1);
6473   addl (idx, 0x2);
6474   andl (idx, 0x1);
6475   subl(idx, 1);
6476   jcc(Assembler::negative, L_post_third_loop_done);
6477 
6478   movl(yz_idx, Address(y, idx, Address::times_4,  0));
6479   movq(product, x_xstart);
6480   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
6481   movl(yz_idx, Address(z, idx, Address::times_4,  0));
6482 
6483   add2_with_carry(rdx, product, yz_idx, carry);
6484 
6485   movl(Address(z, idx, Address::times_4,  0), product);
6486   shrq(product, 32);
6487 
6488   shlq(rdx, 32);
6489   orq(product, rdx);
6490   movq(carry, product);
6491 
6492   bind(L_post_third_loop_done);
6493 }
6494 
6495 /**
6496  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
6497  *
6498  */
6499 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
6500                                                   Register carry, Register carry2,
6501                                                   Register idx, Register jdx,
6502                                                   Register yz_idx1, Register yz_idx2,
6503                                                   Register tmp, Register tmp3, Register tmp4) {
6504   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
6505 
6506   //   jlong carry, x[], y[], z[];
6507   //   int kdx = ystart+1;
6508   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6509   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
6510   //     jlong carry2  = (jlong)(tmp3 >>> 64);
6511   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
6512   //     carry  = (jlong)(tmp4 >>> 64);
6513   //     z[kdx+idx+1] = (jlong)tmp3;
6514   //     z[kdx+idx] = (jlong)tmp4;
6515   //   }
6516   //   idx += 2;
6517   //   if (idx > 0) {
6518   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
6519   //     z[kdx+idx] = (jlong)yz_idx1;
6520   //     carry  = (jlong)(yz_idx1 >>> 64);
6521   //   }
6522   //
6523 
6524   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6525 
6526   movl(jdx, idx);
6527   andl(jdx, 0xFFFFFFFC);
6528   shrl(jdx, 2);
6529 
6530   bind(L_third_loop);
6531   subl(jdx, 1);
6532   jcc(Assembler::negative, L_third_loop_exit);
6533   subl(idx, 4);
6534 
6535   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
6536   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
6537   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
6538   rorxq(yz_idx2, yz_idx2, 32);
6539 
6540   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
6541   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
6542 
6543   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
6544   rorxq(yz_idx1, yz_idx1, 32);
6545   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6546   rorxq(yz_idx2, yz_idx2, 32);
6547 
6548   if (VM_Version::supports_adx()) {
6549     adcxq(tmp3, carry);
6550     adoxq(tmp3, yz_idx1);
6551 
6552     adcxq(tmp4, tmp);
6553     adoxq(tmp4, yz_idx2);
6554 
6555     movl(carry, 0); // does not affect flags
6556     adcxq(carry2, carry);
6557     adoxq(carry2, carry);
6558   } else {
6559     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
6560     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
6561   }
6562   movq(carry, carry2);
6563 
6564   movl(Address(z, idx, Address::times_4, 12), tmp3);
6565   shrq(tmp3, 32);
6566   movl(Address(z, idx, Address::times_4,  8), tmp3);
6567 
6568   movl(Address(z, idx, Address::times_4,  4), tmp4);
6569   shrq(tmp4, 32);
6570   movl(Address(z, idx, Address::times_4,  0), tmp4);
6571 
6572   jmp(L_third_loop);
6573 
6574   bind (L_third_loop_exit);
6575 
6576   andl (idx, 0x3);
6577   jcc(Assembler::zero, L_post_third_loop_done);
6578 
6579   Label L_check_1;
6580   subl(idx, 2);
6581   jcc(Assembler::negative, L_check_1);
6582 
6583   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
6584   rorxq(yz_idx1, yz_idx1, 32);
6585   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
6586   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6587   rorxq(yz_idx2, yz_idx2, 32);
6588 
6589   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
6590 
6591   movl(Address(z, idx, Address::times_4,  4), tmp3);
6592   shrq(tmp3, 32);
6593   movl(Address(z, idx, Address::times_4,  0), tmp3);
6594   movq(carry, tmp4);
6595 
6596   bind (L_check_1);
6597   addl (idx, 0x2);
6598   andl (idx, 0x1);
6599   subl(idx, 1);
6600   jcc(Assembler::negative, L_post_third_loop_done);
6601   movl(tmp4, Address(y, idx, Address::times_4,  0));
6602   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
6603   movl(tmp4, Address(z, idx, Address::times_4,  0));
6604 
6605   add2_with_carry(carry2, tmp3, tmp4, carry);
6606 
6607   movl(Address(z, idx, Address::times_4,  0), tmp3);
6608   shrq(tmp3, 32);
6609 
6610   shlq(carry2, 32);
6611   orq(tmp3, carry2);
6612   movq(carry, tmp3);
6613 
6614   bind(L_post_third_loop_done);
6615 }
6616 
6617 /**
6618  * Code for BigInteger::multiplyToLen() intrinsic.
6619  *
6620  * rdi: x
6621  * rax: xlen
6622  * rsi: y
6623  * rcx: ylen
6624  * r8:  z
6625  * r11: zlen
6626  * r12: tmp1
6627  * r13: tmp2
6628  * r14: tmp3
6629  * r15: tmp4
6630  * rbx: tmp5
6631  *
6632  */
6633 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
6634                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
6635   ShortBranchVerifier sbv(this);
6636   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
6637 
6638   push(tmp1);
6639   push(tmp2);
6640   push(tmp3);
6641   push(tmp4);
6642   push(tmp5);
6643 
6644   push(xlen);
6645   push(zlen);
6646 
6647   const Register idx = tmp1;
6648   const Register kdx = tmp2;
6649   const Register xstart = tmp3;
6650 
6651   const Register y_idx = tmp4;
6652   const Register carry = tmp5;
6653   const Register product  = xlen;
6654   const Register x_xstart = zlen;  // reuse register
6655 
6656   // First Loop.
6657   //
6658   //  final static long LONG_MASK = 0xffffffffL;
6659   //  int xstart = xlen - 1;
6660   //  int ystart = ylen - 1;
6661   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
6663   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
6664   //    z[kdx] = (int)product;
6665   //    carry = product >>> 32;
6666   //  }
6667   //  z[xstart] = (int)carry;
6668   //
6669 
6670   movl(idx, ylen);      // idx = ylen;
6671   movl(kdx, zlen);      // kdx = xlen+ylen;
6672   xorq(carry, carry);   // carry = 0;
6673 
6674   Label L_done;
6675 
6676   movl(xstart, xlen);
6677   decrementl(xstart);
6678   jcc(Assembler::negative, L_done);
6679 
6680   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
6681 
6682   Label L_second_loop;
6683   testl(kdx, kdx);
6684   jcc(Assembler::zero, L_second_loop);
6685 
6686   Label L_carry;
6687   subl(kdx, 1);
6688   jcc(Assembler::zero, L_carry);
6689 
6690   movl(Address(z, kdx, Address::times_4,  0), carry);
6691   shrq(carry, 32);
6692   subl(kdx, 1);
6693 
6694   bind(L_carry);
6695   movl(Address(z, kdx, Address::times_4,  0), carry);
6696 
6697   // Second and third (nested) loops.
6698   //
6699   // for (int i = xstart-1; i >= 0; i--) { // Second loop
6700   //   carry = 0;
6701   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
6702   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
6703   //                    (z[k] & LONG_MASK) + carry;
6704   //     z[k] = (int)product;
6705   //     carry = product >>> 32;
6706   //   }
6707   //   z[i] = (int)carry;
6708   // }
6709   //
6710   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
6711 
6712   const Register jdx = tmp1;
6713 
6714   bind(L_second_loop);
6715   xorl(carry, carry);    // carry = 0;
6716   movl(jdx, ylen);       // j = ystart+1
6717 
6718   subl(xstart, 1);       // i = xstart-1;
6719   jcc(Assembler::negative, L_done);
6720 
6721   push (z);
6722 
6723   Label L_last_x;
6724   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
6725   subl(xstart, 1);       // i = xstart-1;
6726   jcc(Assembler::negative, L_last_x);
6727 
6728   if (UseBMI2Instructions) {
6729     movq(rdx,  Address(x, xstart, Address::times_4,  0));
6730     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
6731   } else {
6732     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
6733     rorq(x_xstart, 32);  // convert big-endian to little-endian
6734   }
6735 
6736   Label L_third_loop_prologue;
6737   bind(L_third_loop_prologue);
6738 
6739   push (x);
6740   push (xstart);
6741   push (ylen);
6742 
6743 
6744   if (UseBMI2Instructions) {
6745     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
6746   } else { // !UseBMI2Instructions
6747     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
6748   }
6749 
6750   pop(ylen);
6751   pop(xlen);
6752   pop(x);
6753   pop(z);
6754 
6755   movl(tmp3, xlen);
6756   addl(tmp3, 1);
6757   movl(Address(z, tmp3, Address::times_4,  0), carry);
6758   subl(tmp3, 1);
6759   jccb(Assembler::negative, L_done);
6760 
6761   shrq(carry, 32);
6762   movl(Address(z, tmp3, Address::times_4,  0), carry);
6763   jmp(L_second_loop);
6764 
6765   // Next infrequent code is moved outside loops.
6766   bind(L_last_x);
6767   if (UseBMI2Instructions) {
6768     movl(rdx, Address(x,  0));
6769   } else {
6770     movl(x_xstart, Address(x,  0));
6771   }
6772   jmp(L_third_loop_prologue);
6773 
6774   bind(L_done);
6775 
6776   pop(zlen);
6777   pop(xlen);
6778 
6779   pop(tmp5);
6780   pop(tmp4);
6781   pop(tmp3);
6782   pop(tmp2);
6783   pop(tmp1);
6784 }
6785 
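// Compare the memory regions at obja and objb and report the position of the
// first difference (used for the vectorized mismatch intrinsic).  'length' is
// an element count and is scaled to a byte count below using
// log2_array_indxscale; the mismatch position is scaled back to an element
// index before returning, and result is set to -1 when all compared bytes are
// equal.  Roughly (a sketch of the scalar equivalent; names are illustrative):
//
//   long byteLen = length << log2scale;
//   for (long i = 0; i < byteLen; i++) {
//     if (a[i] != b[i]) return (int)(i >> log2scale);
//   }
//   return -1;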
6786 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
6787   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
6788   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
6789   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
6790   Label VECTOR8_TAIL, VECTOR4_TAIL;
6791   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
6792   Label SAME_TILL_END, DONE;
6793   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
6794 
  // scale is in rcx in both Win64 and Unix
6796   ShortBranchVerifier sbv(this);
6797 
6798   shlq(length);
6799   xorq(result, result);
6800 
6801   if ((AVX3Threshold == 0) && (UseAVX > 2) &&
6802       VM_Version::supports_avx512vlbw()) {
6803     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
6804 
6805     cmpq(length, 64);
6806     jcc(Assembler::less, VECTOR32_TAIL);
6807 
6808     movq(tmp1, length);
6809     andq(tmp1, 0x3F);      // tail count
6810     andq(length, ~(0x3F)); //vector count
6811 
6812     bind(VECTOR64_LOOP);
6813     // AVX512 code to compare 64 byte vectors.
6814     evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
6815     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
6816     kortestql(k7, k7);
6817     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
6818     addq(result, 64);
6819     subq(length, 64);
6820     jccb(Assembler::notZero, VECTOR64_LOOP);
6821 
6822     //bind(VECTOR64_TAIL);
6823     testq(tmp1, tmp1);
6824     jcc(Assembler::zero, SAME_TILL_END);
6825 
6827     // AVX512 code to compare up to 63 byte vectors.
6828     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
6829     shlxq(tmp2, tmp2, tmp1);
6830     notq(tmp2);
6831     kmovql(k3, tmp2);
6832 
6833     evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
6834     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
6835 
6836     ktestql(k7, k3);
6837     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
6838 
6839     bind(VECTOR64_NOT_EQUAL);
6840     kmovql(tmp1, k7);
6841     notq(tmp1);
6842     tzcntq(tmp1, tmp1);
6843     addq(result, tmp1);
6844     shrq(result);
6845     jmp(DONE);
6846     bind(VECTOR32_TAIL);
6847   }
6848 
6849   cmpq(length, 8);
6850   jcc(Assembler::equal, VECTOR8_LOOP);
6851   jcc(Assembler::less, VECTOR4_TAIL);
6852 
6853   if (UseAVX >= 2) {
6854     Label VECTOR16_TAIL, VECTOR32_LOOP;
6855 
6856     cmpq(length, 16);
6857     jcc(Assembler::equal, VECTOR16_LOOP);
6858     jcc(Assembler::less, VECTOR8_LOOP);
6859 
6860     cmpq(length, 32);
6861     jccb(Assembler::less, VECTOR16_TAIL);
6862 
6863     subq(length, 32);
6864     bind(VECTOR32_LOOP);
6865     vmovdqu(rymm0, Address(obja, result));
6866     vmovdqu(rymm1, Address(objb, result));
6867     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
6868     vptest(rymm2, rymm2);
6869     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
6870     addq(result, 32);
6871     subq(length, 32);
6872     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
6873     addq(length, 32);
6874     jcc(Assembler::equal, SAME_TILL_END);
    // falling through if less than 32 bytes are left
6876 
6877     bind(VECTOR16_TAIL);
6878     cmpq(length, 16);
6879     jccb(Assembler::less, VECTOR8_TAIL);
6880     bind(VECTOR16_LOOP);
6881     movdqu(rymm0, Address(obja, result));
6882     movdqu(rymm1, Address(objb, result));
6883     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
6884     ptest(rymm2, rymm2);
6885     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6886     addq(result, 16);
6887     subq(length, 16);
6888     jcc(Assembler::equal, SAME_TILL_END);
6889     //falling through if less than 16 bytes left
6890   } else {//regular intrinsics
6891 
6892     cmpq(length, 16);
6893     jccb(Assembler::less, VECTOR8_TAIL);
6894 
6895     subq(length, 16);
6896     bind(VECTOR16_LOOP);
6897     movdqu(rymm0, Address(obja, result));
6898     movdqu(rymm1, Address(objb, result));
6899     pxor(rymm0, rymm1);
6900     ptest(rymm0, rymm0);
6901     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6902     addq(result, 16);
6903     subq(length, 16);
6904     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
6905     addq(length, 16);
6906     jcc(Assembler::equal, SAME_TILL_END);
6907     //falling through if less than 16 bytes left
6908   }
6909 
6910   bind(VECTOR8_TAIL);
6911   cmpq(length, 8);
6912   jccb(Assembler::less, VECTOR4_TAIL);
6913   bind(VECTOR8_LOOP);
6914   movq(tmp1, Address(obja, result));
6915   movq(tmp2, Address(objb, result));
6916   xorq(tmp1, tmp2);
6917   testq(tmp1, tmp1);
6918   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
6919   addq(result, 8);
6920   subq(length, 8);
6921   jcc(Assembler::equal, SAME_TILL_END);
6922   //falling through if less than 8 bytes left
6923 
6924   bind(VECTOR4_TAIL);
6925   cmpq(length, 4);
6926   jccb(Assembler::less, BYTES_TAIL);
6927   bind(VECTOR4_LOOP);
6928   movl(tmp1, Address(obja, result));
6929   xorl(tmp1, Address(objb, result));
6930   testl(tmp1, tmp1);
6931   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
6932   addq(result, 4);
6933   subq(length, 4);
6934   jcc(Assembler::equal, SAME_TILL_END);
6935   //falling through if less than 4 bytes left
6936 
6937   bind(BYTES_TAIL);
6938   bind(BYTES_LOOP);
6939   load_unsigned_byte(tmp1, Address(obja, result));
6940   load_unsigned_byte(tmp2, Address(objb, result));
6941   xorl(tmp1, tmp2);
6942   testl(tmp1, tmp1);
6943   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6944   decq(length);
6945   jcc(Assembler::zero, SAME_TILL_END);
6946   incq(result);
6947   load_unsigned_byte(tmp1, Address(obja, result));
6948   load_unsigned_byte(tmp2, Address(objb, result));
6949   xorl(tmp1, tmp2);
6950   testl(tmp1, tmp1);
6951   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6952   decq(length);
6953   jcc(Assembler::zero, SAME_TILL_END);
6954   incq(result);
6955   load_unsigned_byte(tmp1, Address(obja, result));
6956   load_unsigned_byte(tmp2, Address(objb, result));
6957   xorl(tmp1, tmp2);
6958   testl(tmp1, tmp1);
6959   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6960   jmp(SAME_TILL_END);
6961 
6962   if (UseAVX >= 2) {
6963     bind(VECTOR32_NOT_EQUAL);
6964     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
6965     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
6966     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
6967     vpmovmskb(tmp1, rymm0);
6968     bsfq(tmp1, tmp1);
6969     addq(result, tmp1);
6970     shrq(result);
6971     jmp(DONE);
6972   }
6973 
6974   bind(VECTOR16_NOT_EQUAL);
6975   if (UseAVX >= 2) {
6976     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
6977     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
6978     pxor(rymm0, rymm2);
6979   } else {
6980     pcmpeqb(rymm2, rymm2);
6981     pxor(rymm0, rymm1);
6982     pcmpeqb(rymm0, rymm1);
6983     pxor(rymm0, rymm2);
6984   }
6985   pmovmskb(tmp1, rymm0);
6986   bsfq(tmp1, tmp1);
6987   addq(result, tmp1);
6988   shrq(result);
6989   jmpb(DONE);
6990 
6991   bind(VECTOR8_NOT_EQUAL);
6992   bind(VECTOR4_NOT_EQUAL);
6993   bsfq(tmp1, tmp1);
6994   shrq(tmp1, 3);
6995   addq(result, tmp1);
6996   bind(BYTES_NOT_EQUAL);
6997   shrq(result);
6998   jmpb(DONE);
6999 
7000   bind(SAME_TILL_END);
7001   mov64(result, -1);
7002 
7003   bind(DONE);
7004 }
7005 
7006 //Helper functions for square_to_len()
7007 
7008 /**
7009  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
7010  * Preserves x and z and modifies rest of the registers.
7011  */
7012 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7013   // Perform square and right shift by 1
7014   // Handle odd xlen case first, then for even xlen do the following
7015   // jlong carry = 0;
7016   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
7017   //     huge_128 product = x[j:j+1] * x[j:j+1];
7018   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
7019   //     z[i+2:i+3] = (jlong)(product >>> 1);
7020   //     carry = (jlong)product;
7021   // }
7022 
7023   xorq(tmp5, tmp5);     // carry
7024   xorq(rdxReg, rdxReg);
7025   xorl(tmp1, tmp1);     // index for x
7026   xorl(tmp4, tmp4);     // index for z
7027 
7028   Label L_first_loop, L_first_loop_exit;
7029 
7030   testl(xlen, 1);
7031   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
7032 
7033   // Square and right shift by 1 the odd element using 32 bit multiply
7034   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
7035   imulq(raxReg, raxReg);
7036   shrq(raxReg, 1);
7037   adcq(tmp5, 0);
7038   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
7039   incrementl(tmp1);
7040   addl(tmp4, 2);
7041 
7042   // Square and  right shift by 1 the rest using 64 bit multiply
7043   bind(L_first_loop);
7044   cmpptr(tmp1, xlen);
7045   jccb(Assembler::equal, L_first_loop_exit);
7046 
7047   // Square
7048   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
7049   rorq(raxReg, 32);    // convert big-endian to little-endian
7050   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
7051 
7052   // Right shift by 1 and save carry
7053   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
7054   rcrq(rdxReg, 1);
7055   rcrq(raxReg, 1);
7056   adcq(tmp5, 0);
7057 
7058   // Store result in z
7059   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
7060   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
7061 
7062   // Update indices for x and z
7063   addl(tmp1, 2);
7064   addl(tmp4, 4);
7065   jmp(L_first_loop);
7066 
7067   bind(L_first_loop_exit);
7068 }
7069 
7070 
7071 /**
7072  * Perform the following multiply add operation using BMI2 instructions
7073  * carry:sum = sum + op1*op2 + carry
7074  * op2 should be in rdx
7075  * op2 is preserved, all other registers are modified
7076  */
7077 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
7078   // assert op2 is rdx
7079   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
7080   addq(sum, carry);
7081   adcq(tmp2, 0);
7082   addq(sum, op1);
7083   adcq(tmp2, 0);
7084   movq(carry, tmp2);
7085 }
7086 
7087 /**
7088  * Perform the following multiply add operation:
7089  * carry:sum = sum + op1*op2 + carry
7090  * Preserves op1, op2 and modifies rest of registers
7091  */
7092 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
7093   // rdx:rax = op1 * op2
7094   movq(raxReg, op2);
7095   mulq(op1);
7096 
7097   //  rdx:rax = sum + carry + rdx:rax
7098   addq(sum, carry);
7099   adcq(rdxReg, 0);
7100   addq(sum, raxReg);
7101   adcq(rdxReg, 0);
7102 
7103   // carry:sum = rdx:sum
7104   movq(carry, rdxReg);
7105 }
7106 
7107 /**
7108  * Add 64 bit long carry into z[] with carry propagation.
7109  * Preserves z and carry register values and modifies rest of registers.
7110  *
7111  */
7112 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
7113   Label L_fourth_loop, L_fourth_loop_exit;
7114 
7115   movl(tmp1, 1);
7116   subl(zlen, 2);
7117   addq(Address(z, zlen, Address::times_4, 0), carry);
7118 
7119   bind(L_fourth_loop);
7120   jccb(Assembler::carryClear, L_fourth_loop_exit);
7121   subl(zlen, 2);
7122   jccb(Assembler::negative, L_fourth_loop_exit);
7123   addq(Address(z, zlen, Address::times_4, 0), tmp1);
7124   jmp(L_fourth_loop);
7125   bind(L_fourth_loop_exit);
7126 }
7127 
7128 /**
7129  * Shift z[] left by 1 bit.
7130  * Preserves x, len, z and zlen registers and modifies rest of the registers.
7131  *
7132  */
7133 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
7134 
7135   Label L_fifth_loop, L_fifth_loop_exit;
7136 
7137   // Fifth loop
7138   // Perform primitiveLeftShift(z, zlen, 1)
7139 
7140   const Register prev_carry = tmp1;
7141   const Register new_carry = tmp4;
7142   const Register value = tmp2;
7143   const Register zidx = tmp3;
7144 
7145   // int zidx, carry;
7146   // long value;
7147   // carry = 0;
7148   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
7149   //    (carry:value)  = (z[i] << 1) | carry ;
7150   //    z[i] = value;
7151   // }
7152 
7153   movl(zidx, zlen);
7154   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
7155 
7156   bind(L_fifth_loop);
7157   decl(zidx);  // Use decl to preserve carry flag
7158   decl(zidx);
7159   jccb(Assembler::negative, L_fifth_loop_exit);
7160 
7161   if (UseBMI2Instructions) {
7162      movq(value, Address(z, zidx, Address::times_4, 0));
7163      rclq(value, 1);
7164      rorxq(value, value, 32);
7165      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
7166   }
7167   else {
7168     // clear new_carry
7169     xorl(new_carry, new_carry);
7170 
7171     // Shift z[i] by 1, or in previous carry and save new carry
7172     movq(value, Address(z, zidx, Address::times_4, 0));
7173     shlq(value, 1);
7174     adcl(new_carry, 0);
7175 
7176     orq(value, prev_carry);
7177     rorq(value, 0x20);
7178     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
7179 
7180     // Set previous carry = new carry
7181     movl(prev_carry, new_carry);
7182   }
7183   jmp(L_fifth_loop);
7184 
7185   bind(L_fifth_loop_exit);
7186 }
7187 
7188 
7189 /**
7190  * Code for BigInteger::squareToLen() intrinsic
7191  *
7192  * rdi: x
7193  * rsi: len
7194  * r8:  z
7195  * rcx: zlen
7196  * r12: tmp1
7197  * r13: tmp2
7198  * r14: tmp3
7199  * r15: tmp4
7200  * rbx: tmp5
7201  *
7202  */
7203 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7204 
7205   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
7206   push(tmp1);
7207   push(tmp2);
7208   push(tmp3);
7209   push(tmp4);
7210   push(tmp5);
7211 
7212   // First loop
7213   // Store the squares, right shifted one bit (i.e., divided by 2).
7214   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
7215 
7216   // Add in off-diagonal sums.
7217   //
7218   // Second, third (nested) and fourth loops.
7219   // zlen +=2;
7220   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
7221   //    carry = 0;
7222   //    long op2 = x[xidx:xidx+1];
7223   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
7224   //       k -= 2;
7225   //       long op1 = x[j:j+1];
7226   //       long sum = z[k:k+1];
7227   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
7228   //       z[k:k+1] = sum;
7229   //    }
7230   //    add_one_64(z, k, carry, tmp_regs);
7231   // }
7232 
7233   const Register carry = tmp5;
7234   const Register sum = tmp3;
7235   const Register op1 = tmp4;
7236   Register op2 = tmp2;
7237 
7238   push(zlen);
7239   push(len);
7240   addl(zlen,2);
7241   bind(L_second_loop);
7242   xorq(carry, carry);
7243   subl(zlen, 4);
7244   subl(len, 2);
7245   push(zlen);
7246   push(len);
7247   cmpl(len, 0);
7248   jccb(Assembler::lessEqual, L_second_loop_exit);
7249 
7250   // Multiply an array by one 64 bit long.
7251   if (UseBMI2Instructions) {
7252     op2 = rdxReg;
7253     movq(op2, Address(x, len, Address::times_4,  0));
7254     rorxq(op2, op2, 32);
7255   }
7256   else {
7257     movq(op2, Address(x, len, Address::times_4,  0));
7258     rorq(op2, 32);
7259   }
7260 
7261   bind(L_third_loop);
7262   decrementl(len);
7263   jccb(Assembler::negative, L_third_loop_exit);
7264   decrementl(len);
7265   jccb(Assembler::negative, L_last_x);
7266 
7267   movq(op1, Address(x, len, Address::times_4,  0));
7268   rorq(op1, 32);
7269 
7270   bind(L_multiply);
7271   subl(zlen, 2);
7272   movq(sum, Address(z, zlen, Address::times_4,  0));
7273 
7274   // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
7275   if (UseBMI2Instructions) {
7276     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
7277   }
7278   else {
7279     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7280   }
7281 
7282   movq(Address(z, zlen, Address::times_4, 0), sum);
7283 
7284   jmp(L_third_loop);
7285   bind(L_third_loop_exit);
7286 
7287   // Fourth loop
7288   // Add 64 bit long carry into z with carry propagation.
7289   // Uses offsetted zlen.
7290   add_one_64(z, zlen, carry, tmp1);
7291 
7292   pop(len);
7293   pop(zlen);
7294   jmp(L_second_loop);
7295 
7296   // Next infrequent code is moved outside loops.
7297   bind(L_last_x);
7298   movl(op1, Address(x, 0));
7299   jmp(L_multiply);
7300 
7301   bind(L_second_loop_exit);
7302   pop(len);
7303   pop(zlen);
7304   pop(len);
7305   pop(zlen);
7306 
7307   // Fifth loop
7308   // Shift z left 1 bit.
7309   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
7310 
7311   // z[zlen-1] |= x[len-1] & 1;
7312   movl(tmp3, Address(x, len, Address::times_4, -4));
7313   andl(tmp3, 1);
7314   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
7315 
7316   pop(tmp5);
7317   pop(tmp4);
7318   pop(tmp3);
7319   pop(tmp2);
7320   pop(tmp1);
7321 }
7322 
7323 /**
 * Helper function for mul_add().
 * Multiplies in[] by the int k and adds the result to out[] starting at offset
 * offs, using 128-bit by 32-bit multiplies, and returns the carry in tmp5.
 * Only a length of in[] that is a multiple of four ints is processed here;
 * mul_add() handles the remaining entries.
 * k is in rdxReg when BMI2 instructions are used, otherwise in tmp2.
 * This function preserves the out, in and k registers.
 * len and offset point to the appropriate index in "in" and "out" respectively.
 * tmp5 has the carry.
 * The other registers are temporary and are modified.
7333  *
7334  */
7335 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
7336   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
7337   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7338 
7339   Label L_first_loop, L_first_loop_exit;
7340 
7341   movl(tmp1, len);
7342   shrl(tmp1, 2);
7343 
7344   bind(L_first_loop);
7345   subl(tmp1, 1);
7346   jccb(Assembler::negative, L_first_loop_exit);
7347 
7348   subl(len, 4);
7349   subl(offset, 4);
7350 
7351   Register op2 = tmp2;
7352   const Register sum = tmp3;
7353   const Register op1 = tmp4;
7354   const Register carry = tmp5;
7355 
7356   if (UseBMI2Instructions) {
7357     op2 = rdxReg;
7358   }
7359 
7360   movq(op1, Address(in, len, Address::times_4,  8));
7361   rorq(op1, 32);
7362   movq(sum, Address(out, offset, Address::times_4,  8));
7363   rorq(sum, 32);
7364   if (UseBMI2Instructions) {
7365     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7366   }
7367   else {
7368     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7369   }
7370   // Store back in big endian from little endian
7371   rorq(sum, 0x20);
7372   movq(Address(out, offset, Address::times_4,  8), sum);
7373 
7374   movq(op1, Address(in, len, Address::times_4,  0));
7375   rorq(op1, 32);
7376   movq(sum, Address(out, offset, Address::times_4,  0));
7377   rorq(sum, 32);
7378   if (UseBMI2Instructions) {
7379     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7380   }
7381   else {
7382     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7383   }
7384   // Store back in big endian from little endian
7385   rorq(sum, 0x20);
7386   movq(Address(out, offset, Address::times_4,  0), sum);
7387 
7388   jmp(L_first_loop);
7389   bind(L_first_loop_exit);
7390 }
7391 
7392 /**
7393  * Code for BigInteger::mulAdd() intrinsic
7394  *
7395  * rdi: out
7396  * rsi: in
7397  * r11: offs (out.length - offset)
7398  * rcx: len
7399  * r8:  k
7400  * r12: tmp1
7401  * r13: tmp2
7402  * r14: tmp3
7403  * r15: tmp4
7404  * rbx: tmp5
7405  * Multiply the in[] by word k and add to out[], return the carry in rax
7406  */
7407 void MacroAssembler::mul_add(Register out, Register in, Register offs,
7408    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
7409    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7410 
7411   Label L_carry, L_last_in, L_done;
7412 
7413 // carry = 0;
7414 // for (int j=len-1; j >= 0; j--) {
7415 //    long product = (in[j] & LONG_MASK) * kLong +
7416 //                   (out[offs] & LONG_MASK) + carry;
7417 //    out[offs--] = (int)product;
7418 //    carry = product >>> 32;
7419 // }
7420 //
7421   push(tmp1);
7422   push(tmp2);
7423   push(tmp3);
7424   push(tmp4);
7425   push(tmp5);
7426 
7427   Register op2 = tmp2;
7428   const Register sum = tmp3;
7429   const Register op1 = tmp4;
7430   const Register carry =  tmp5;
7431 
  if (UseBMI2Instructions) {
    op2 = rdxReg;
  }
  movl(op2, k);
7439 
7440   xorq(carry, carry);
7441 
7442   //First loop
7443 
7444   //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
7445   //The carry is in tmp5
7446   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
7447 
7448   //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
7449   decrementl(len);
7450   jccb(Assembler::negative, L_carry);
7451   decrementl(len);
7452   jccb(Assembler::negative, L_last_in);
7453 
7454   movq(op1, Address(in, len, Address::times_4,  0));
7455   rorq(op1, 32);
7456 
7457   subl(offs, 2);
7458   movq(sum, Address(out, offs, Address::times_4,  0));
7459   rorq(sum, 32);
7460 
7461   if (UseBMI2Instructions) {
7462     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7463   }
7464   else {
7465     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7466   }
7467 
7468   // Store back in big endian from little endian
7469   rorq(sum, 0x20);
7470   movq(Address(out, offs, Address::times_4,  0), sum);
7471 
7472   testl(len, len);
7473   jccb(Assembler::zero, L_carry);
7474 
7475   //Multiply the last in[] entry, if any
7476   bind(L_last_in);
7477   movl(op1, Address(in, 0));
7478   movl(sum, Address(out, offs, Address::times_4,  -4));
7479 
7480   movl(raxReg, k);
7481   mull(op1); //tmp4 * eax -> edx:eax
7482   addl(sum, carry);
7483   adcl(rdxReg, 0);
7484   addl(sum, raxReg);
7485   adcl(rdxReg, 0);
7486   movl(carry, rdxReg);
7487 
7488   movl(Address(out, offs, Address::times_4,  -4), sum);
7489 
7490   bind(L_carry);
7491   //return tmp5/carry as carry in rax
7492   movl(rax, carry);
7493 
7494   bind(L_done);
7495   pop(tmp5);
7496   pop(tmp4);
7497   pop(tmp3);
7498   pop(tmp2);
7499   pop(tmp1);
7500 }
7501 #endif
7502 
7503 /**
7504  * Emits code to update CRC-32 with a byte value according to constants in table
7505  *
7506  * @param [in,out]crc   Register containing the crc.
7507  * @param [in]val       Register containing the byte to fold into the CRC.
7508  * @param [in]table     Register containing the table of crc constants.
7509  *
7510  * uint32_t crc;
7511  * val = crc_table[(val ^ crc) & 0xFF];
7512  * crc = val ^ (crc >> 8);
7513  *
7514  */
7515 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7516   xorl(val, crc);
7517   andl(val, 0xFF);
7518   shrl(crc, 8); // unsigned shift
7519   xorl(crc, Address(table, val, Address::times_4, 0));
7520 }
7521 
7522 /**
7523  * Fold 128-bit data chunk
7524  */
7525 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7526   if (UseAVX > 0) {
7527     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7528     vpclmulldq(xcrc, xK, xcrc); // [63:0]
7529     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7530     pxor(xcrc, xtmp);
7531   } else {
7532     movdqa(xtmp, xcrc);
7533     pclmulhdq(xtmp, xK);   // [123:64]
7534     pclmulldq(xcrc, xK);   // [63:0]
7535     pxor(xcrc, xtmp);
7536     movdqu(xtmp, Address(buf, offset));
7537     pxor(xcrc, xtmp);
7538   }
7539 }
7540 
7541 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7542   if (UseAVX > 0) {
7543     vpclmulhdq(xtmp, xK, xcrc);
7544     vpclmulldq(xcrc, xK, xcrc);
7545     pxor(xcrc, xbuf);
7546     pxor(xcrc, xtmp);
7547   } else {
7548     movdqa(xtmp, xcrc);
7549     pclmulhdq(xtmp, xK);
7550     pclmulldq(xcrc, xK);
7551     pxor(xcrc, xbuf);
7552     pxor(xcrc, xtmp);
7553   }
7554 }
7555 
7556 /**
7557  * 8-bit folds to compute 32-bit CRC
7558  *
7559  * uint64_t xcrc;
7560  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7561  */
7562 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7563   movdl(tmp, xcrc);
7564   andl(tmp, 0xFF);
7565   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7566   psrldq(xcrc, 1); // unsigned shift one byte
7567   pxor(xcrc, xtmp);
7568 }
7569 
7570 /**
7571  * uint32_t crc;
7572  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7573  */
7574 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7575   movl(tmp, crc);
7576   andl(tmp, 0xFF);
7577   shrl(crc, 8);
7578   xorl(crc, Address(table, tmp, Address::times_4, 0));
7579 }
7580 
7581 /**
7582  * @param crc   register containing existing CRC (32-bit)
7583  * @param buf   register pointing to input byte buffer (byte*)
7584  * @param len   register containing number of bytes
7585  * @param table register that will contain address of CRC table
7586  * @param tmp   scratch register
7587  */
7588 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7589   assert_different_registers(crc, buf, len, table, tmp, rax);
7590 
7591   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7592   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7593 
7594   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7595   // context for the registers used, where all instructions below are using 128-bit mode
7596   // On EVEX without VL and BW, these instructions will all be AVX.
7597   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7598   notl(crc); // ~crc
7599   cmpl(len, 16);
7600   jcc(Assembler::less, L_tail);
7601 
7602   // Align buffer to 16 bytes
7603   movl(tmp, buf);
7604   andl(tmp, 0xF);
7605   jccb(Assembler::zero, L_aligned);
7606   subl(tmp,  16);
7607   addl(len, tmp);
7608 
7609   align(4);
7610   BIND(L_align_loop);
7611   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7612   update_byte_crc32(crc, rax, table);
7613   increment(buf);
7614   incrementl(tmp);
7615   jccb(Assembler::less, L_align_loop);
7616 
7617   BIND(L_aligned);
7618   movl(tmp, len); // save
7619   shrl(len, 4);
7620   jcc(Assembler::zero, L_tail_restore);
7621 
7622   // Fold crc into first bytes of vector
7623   movdqa(xmm1, Address(buf, 0));
7624   movdl(rax, xmm1);
7625   xorl(crc, rax);
7626   if (VM_Version::supports_sse4_1()) {
7627     pinsrd(xmm1, crc, 0);
7628   } else {
7629     pinsrw(xmm1, crc, 0);
7630     shrl(crc, 16);
7631     pinsrw(xmm1, crc, 1);
7632   }
7633   addptr(buf, 16);
7634   subl(len, 4); // len > 0
7635   jcc(Assembler::less, L_fold_tail);
7636 
7637   movdqa(xmm2, Address(buf,  0));
7638   movdqa(xmm3, Address(buf, 16));
7639   movdqa(xmm4, Address(buf, 32));
7640   addptr(buf, 48);
7641   subl(len, 3);
7642   jcc(Assembler::lessEqual, L_fold_512b);
7643 
7644   // Fold total 512 bits of polynomial on each iteration,
7645   // 128 bits per each of 4 parallel streams.
7646   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32), rscratch1);
7647 
7648   align32();
7649   BIND(L_fold_512b_loop);
7650   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7651   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
7652   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
7653   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
7654   addptr(buf, 64);
7655   subl(len, 4);
7656   jcc(Assembler::greater, L_fold_512b_loop);
7657 
7658   // Fold 512 bits to 128 bits.
7659   BIND(L_fold_512b);
7660   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
7661   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
7662   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7663   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7664 
7665   // Fold the remaining 128-bit data chunks
7666   BIND(L_fold_tail);
7667   addl(len, 3);
7668   jccb(Assembler::lessEqual, L_fold_128b);
7669   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
7670 
7671   BIND(L_fold_tail_loop);
7672   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7673   addptr(buf, 16);
7674   decrementl(len);
7675   jccb(Assembler::greater, L_fold_tail_loop);
7676 
7677   // Fold 128 bits in xmm1 down into 32 bits in crc register.
7678   BIND(L_fold_128b);
7679   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()), rscratch1);
7680   if (UseAVX > 0) {
7681     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7682     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7683     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7684   } else {
7685     movdqa(xmm2, xmm0);
7686     pclmulqdq(xmm2, xmm1, 0x1);
7687     movdqa(xmm3, xmm0);
7688     pand(xmm3, xmm2);
7689     pclmulqdq(xmm0, xmm3, 0x1);
7690   }
7691   psrldq(xmm1, 8);
7692   psrldq(xmm2, 4);
7693   pxor(xmm0, xmm1);
7694   pxor(xmm0, xmm2);
7695 
7696   // 8 8-bit folds to compute 32-bit CRC.
7697   for (int j = 0; j < 4; j++) {
7698     fold_8bit_crc32(xmm0, table, xmm1, rax);
7699   }
7700   movdl(crc, xmm0); // mov 32 bits to general register
7701   for (int j = 0; j < 4; j++) {
7702     fold_8bit_crc32(crc, table, rax);
7703   }
7704 
7705   BIND(L_tail_restore);
7706   movl(len, tmp); // restore
7707   BIND(L_tail);
7708   andl(len, 0xf);
7709   jccb(Assembler::zero, L_exit);
7710 
7711   // Fold the remaining bytes
7712   align(4);
7713   BIND(L_tail_loop);
7714   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7715   update_byte_crc32(crc, rax, table);
7716   increment(buf);
7717   decrementl(len);
7718   jccb(Assembler::greater, L_tail_loop);
7719 
7720   BIND(L_exit);
7721   notl(crc); // ~crc
7722 }
7723 
7724 #ifdef _LP64
7725 // Helper function for AVX 512 CRC32
7726 // Fold 512-bit data chunks
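     // A rough SSE-intrinsic sketch of what one 128-bit lane of this fold does
     // (illustrative only; the routine below applies it to all four lanes of a zmm
     // register at once, with xK holding the two 64-bit fold constants):
     //
     //   __m128i fold_lane(__m128i acc, __m128i k, __m128i data) {
     //     return _mm_xor_si128(data,
     //            _mm_xor_si128(_mm_clmulepi64_si128(acc, k, 0x10),
     //                          _mm_clmulepi64_si128(acc, k, 0x01)));
     //   }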
7727 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
7728                                              Register pos, int offset) {
7729   evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
7730   evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [127:64]
7731   evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
7732   evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
7733   evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
7734 }
7735 
7736 // Helper function for AVX 512 CRC32
7737 // Compute CRC32 for < 256B buffers
7738 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
7739                                               Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7740                                               Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7741 
7742   Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
7743   Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
7744   Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
7745 
7746   // check if there is enough buffer to be able to fold 16B at a time
7747   cmpl(len, 32);
7748   jcc(Assembler::less, L_less_than_32);
7749 
7750   // if there is, load the constants
7751   movdqu(xmm10, Address(table, 1 * 16));    //rk1 and rk2 in xmm10
7752   movdl(xmm0, crc);                        // get the initial crc value
7753   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7754   pxor(xmm7, xmm0);
7755 
7756   // update the buffer pointer
7757   addl(pos, 16);
7758   // update the counter; subtract 32 instead of 16 to save one instruction in the loop
7759   subl(len, 32);
7760   jmp(L_16B_reduction_loop);
7761 
7762   bind(L_less_than_32);
7763   // move the initial crc to the return value; this is necessary for zero-length buffers
7764   movl(rax, crc);
7765   testl(len, len);
7766   jcc(Assembler::equal, L_cleanup);
7767 
7768   movdl(xmm0, crc);                        //get the initial crc value
7769 
7770   cmpl(len, 16);
7771   jcc(Assembler::equal, L_exact_16_left);
7772   jcc(Assembler::less, L_less_than_16_left);
7773 
7774   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7775   pxor(xmm7, xmm0);                       //xor the initial crc value
7776   addl(pos, 16);
7777   subl(len, 16);
7778   movdqu(xmm10, Address(table, 1 * 16));    // rk1 and rk2 in xmm10
7779   jmp(L_get_last_two_xmms);
7780 
7781   bind(L_less_than_16_left);
7782   // use stack space to load data of less than 16 bytes; zero out the 16B in memory first
7783   pxor(xmm1, xmm1);
7784   movptr(tmp1, rsp);
7785   movdqu(Address(tmp1, 0 * 16), xmm1);
7786 
7787   cmpl(len, 4);
7788   jcc(Assembler::less, L_only_less_than_4);
7789 
7790   // back up the counter value
7791   movl(tmp2, len);
7792   cmpl(len, 8);
7793   jcc(Assembler::less, L_less_than_8_left);
7794 
7795   //load 8 Bytes
7796   movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
7797   movq(Address(tmp1, 0 * 16), rax);
7798   addptr(tmp1, 8);
7799   subl(len, 8);
7800   addl(pos, 8);
7801 
7802   bind(L_less_than_8_left);
7803   cmpl(len, 4);
7804   jcc(Assembler::less, L_less_than_4_left);
7805 
7806   //load 4 Bytes
7807   movl(rax, Address(buf, pos, Address::times_1, 0));
7808   movl(Address(tmp1, 0 * 16), rax);
7809   addptr(tmp1, 4);
7810   subl(len, 4);
7811   addl(pos, 4);
7812 
7813   bind(L_less_than_4_left);
7814   cmpl(len, 2);
7815   jcc(Assembler::less, L_less_than_2_left);
7816 
7817   // load 2 Bytes
7818   movw(rax, Address(buf, pos, Address::times_1, 0));
7819   movl(Address(tmp1, 0 * 16), rax);
7820   addptr(tmp1, 2);
7821   subl(len, 2);
7822   addl(pos, 2);
7823 
7824   bind(L_less_than_2_left);
7825   cmpl(len, 1);
7826   jcc(Assembler::less, L_zero_left);
7827 
7828   // load 1 Byte
7829   movb(rax, Address(buf, pos, Address::times_1, 0));
7830   movb(Address(tmp1, 0 * 16), rax);
7831 
7832   bind(L_zero_left);
7833   movdqu(xmm7, Address(rsp, 0));
7834   pxor(xmm7, xmm0);                       //xor the initial crc value
7835 
7836   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7837   movdqu(xmm0, Address(rax, tmp2));
7838   pshufb(xmm7, xmm0);
7839   jmp(L_128_done);
7840 
7841   bind(L_exact_16_left);
7842   movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
7843   pxor(xmm7, xmm0);                       //xor the initial crc value
7844   jmp(L_128_done);
7845 
7846   bind(L_only_less_than_4);
7847   cmpl(len, 3);
7848   jcc(Assembler::less, L_only_less_than_3);
7849 
7850   // load 3 Bytes
7851   movb(rax, Address(buf, pos, Address::times_1, 0));
7852   movb(Address(tmp1, 0), rax);
7853 
7854   movb(rax, Address(buf, pos, Address::times_1, 1));
7855   movb(Address(tmp1, 1), rax);
7856 
7857   movb(rax, Address(buf, pos, Address::times_1, 2));
7858   movb(Address(tmp1, 2), rax);
7859 
7860   movdqu(xmm7, Address(rsp, 0));
7861   pxor(xmm7, xmm0);                     //xor the initial crc value
7862 
7863   pslldq(xmm7, 0x5);
7864   jmp(L_barrett);
7865   bind(L_only_less_than_3);
7866   cmpl(len, 2);
7867   jcc(Assembler::less, L_only_less_than_2);
7868 
7869   // load 2 Bytes
7870   movb(rax, Address(buf, pos, Address::times_1, 0));
7871   movb(Address(tmp1, 0), rax);
7872 
7873   movb(rax, Address(buf, pos, Address::times_1, 1));
7874   movb(Address(tmp1, 1), rax);
7875 
7876   movdqu(xmm7, Address(rsp, 0));
7877   pxor(xmm7, xmm0);                     //xor the initial crc value
7878 
7879   pslldq(xmm7, 0x6);
7880   jmp(L_barrett);
7881 
7882   bind(L_only_less_than_2);
7883   //load 1 Byte
7884   movb(rax, Address(buf, pos, Address::times_1, 0));
7885   movb(Address(tmp1, 0), rax);
7886 
7887   movdqu(xmm7, Address(rsp, 0));
7888   pxor(xmm7, xmm0);                     //xor the initial crc value
7889 
7890   pslldq(xmm7, 0x7);
7891 }
7892 
7893 /**
7894  * Compute CRC32 using AVX512 instructions
7895  * @param crc   register containing existing CRC (32-bit)
7896  * @param buf   register pointing to input byte buffer (byte*)
7897  * @param len   register containing number of bytes
7898  * @param table register containing address of crc or crc32c table
7899  * @param tmp1  scratch register
7900  * @param tmp2  scratch register
7901  * @return rax  result register
7902  *
7903  * This routine is identical for crc32c except for the precomputed constant
7904  * table, which is passed as the table argument.  The calculation steps are
7905  * the same for both variants.
7906  */
7907 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
7908   assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
7909 
7910   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7911   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7912   Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
7913   Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
7914   Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
7915 
7916   const Register pos = r12;
7917   push(r12);
7918   subptr(rsp, 16 * 2 + 8);
7919 
7920   // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
7921   // context for the registers used, since all instructions below use 128-bit mode.
7922   // On EVEX without VL and BW, these instructions will all be AVX.
7923   movl(pos, 0);
7924 
7925   // check if smaller than 256B
7926   cmpl(len, 256);
7927   jcc(Assembler::less, L_less_than_256);
7928 
7929   // load the initial crc value
7930   movdl(xmm10, crc);
7931 
7932   // load the initial 128B of data and xor in the initial crc value
7933   evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
7934   evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
7935   evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
7936   evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
7937 
7938   subl(len, 256);
7939   cmpl(len, 256);
7940   jcc(Assembler::less, L_fold_128_B_loop);
7941 
7942   evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
7943   evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
7944   evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
7945   subl(len, 256);
7946 
7947   bind(L_fold_256_B_loop);
7948   addl(pos, 256);
7949   fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
7950   fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
7951   fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
7952   fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
7953 
7954   subl(len, 256);
7955   jcc(Assembler::greaterEqual, L_fold_256_B_loop);
7956 
7957   // Fold 256 into 128
7958   addl(pos, 256);
7959   evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
7960   evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
7961   vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
7962 
7963   evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
7964   evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
7965   vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
7966 
7967   evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
7968   evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
7969 
7970   addl(len, 128);
7971   jmp(L_fold_128_B_register);
7972 
7973   // At this point in the code there are 128 * x + y (0 <= y < 128) bytes of buffer. The
7974   // fold_128_B_loop below will fold 128B at a time until only 128 + y bytes of buffer remain.
7975 
7976   // fold 128B at a time. This section of the code folds two 512-bit registers in parallel
7977   bind(L_fold_128_B_loop);
7978   addl(pos, 128);
7979   fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
7980   fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
7981 
7982   subl(len, 128);
7983   jcc(Assembler::greaterEqual, L_fold_128_B_loop);
7984 
7985   addl(pos, 128);
7986 
7987   // at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
7988   // the 128B of folded data is held in the two 512-bit registers zmm0 and zmm4
7989   bind(L_fold_128_B_register);
7990   evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
7991   evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7992   evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
7993   evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
7994   // save the last 128-bit lane, which has no multiplicand
7995   vextracti64x2(xmm7, xmm4, 3);
7996 
7997   evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
7998   evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
7999   // Needed later in reduction loop
8000   movdqu(xmm10, Address(table, 1 * 16));
8001   vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
8002   vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
8003 
8004   // Swap 1,0,3,2 - 01 00 11 10
8005   evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
8006   evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
8007   vextracti128(xmm5, xmm8, 1);
8008   evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
8009 
8010   // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction in the loop
8011   // instead of a cmp instruction, we use the negative flag with the jl instruction
8012   addl(len, 128 - 16);
8013   jcc(Assembler::less, L_final_reduction_for_128);
8014 
8015   bind(L_16B_reduction_loop);
8016   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8017   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8018   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8019   movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
8020   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8021   addl(pos, 16);
8022   subl(len, 16);
8023   jcc(Assembler::greaterEqual, L_16B_reduction_loop);
8024 
8025   bind(L_final_reduction_for_128);
8026   addl(len, 16);
8027   jcc(Assembler::equal, L_128_done);
8028 
8029   bind(L_get_last_two_xmms);
8030   movdqu(xmm2, xmm7);
8031   addl(pos, len);
8032   movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
8033   subl(pos, len);
8034 
8035   // get rid of the extra data that was loaded before
8036   // load the shift constant
8037   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
8038   movdqu(xmm0, Address(rax, len));
8039   addl(rax, len);
8040 
8041   vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8042   //Change mask to 512
8043   vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
8044   vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
8045 
8046   blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
8047   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8048   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8049   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8050   vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
8051 
8052   bind(L_128_done);
8053   // compute crc of a 128-bit value
8054   movdqu(xmm10, Address(table, 3 * 16));
8055   movdqu(xmm0, xmm7);
8056 
8057   // 64b fold
8058   vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
8059   vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
8060   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8061 
8062   // 32b fold
8063   movdqu(xmm0, xmm7);
8064   vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
8065   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8066   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8067   jmp(L_barrett);
8068 
8069   bind(L_less_than_256);
8070   kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
8071 
8072   // Barrett reduction
8073   bind(L_barrett);
8074   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
8075   movdqu(xmm1, xmm7);
8076   movdqu(xmm2, xmm7);
8077   movdqu(xmm10, Address(table, 4 * 16));
8078 
8079   pclmulqdq(xmm7, xmm10, 0x0);
8080   pxor(xmm7, xmm2);
8081   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
8082   movdqu(xmm2, xmm7);
8083   pclmulqdq(xmm7, xmm10, 0x10);
8084   pxor(xmm7, xmm2);
8085   pxor(xmm7, xmm1);
8086   pextrd(crc, xmm7, 2);
8087 
8088   bind(L_cleanup);
8089   addptr(rsp, 16 * 2 + 8);
8090   pop(r12);
8091 }
8092 
8093 // S. Gueron / Information Processing Letters 112 (2012) 184
8094 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
8095 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
8096 // Output: the 64-bit carry-less product of B * CONST
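     // An equivalent C sketch of the table-driven carry-less multiply implemented below
     // (names are illustrative; TABLEExt stands for the precomputed table referenced in
     // the step comments):
     //
     //   uint64_t clmul_by_table(uint32_t b, const uint64_t TABLEExt[][256], uint32_t n) {
     //     uint64_t q1 = TABLEExt[n][ b        & 0xFF];
     //     uint64_t q2 = TABLEExt[n][(b >>  8) & 0xFF];
     //     uint64_t q3 = TABLEExt[n][(b >> 16) & 0xFF];
     //     uint64_t q4 = TABLEExt[n][(b >> 24) & 0xFF];
     //     return q1 ^ (q2 << 8) ^ (q3 << 16) ^ (q4 << 24);
     //   }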
8097 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
8098                                      Register tmp1, Register tmp2, Register tmp3) {
8099   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
8100   if (n > 0) {
8101     addq(tmp3, n * 256 * 8);
8102   }
8103   //    Q1 = TABLEExt[n][B & 0xFF];
8104   movl(tmp1, in);
8105   andl(tmp1, 0x000000FF);
8106   shll(tmp1, 3);
8107   addq(tmp1, tmp3);
8108   movq(tmp1, Address(tmp1, 0));
8109 
8110   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
8111   movl(tmp2, in);
8112   shrl(tmp2, 8);
8113   andl(tmp2, 0x000000FF);
8114   shll(tmp2, 3);
8115   addq(tmp2, tmp3);
8116   movq(tmp2, Address(tmp2, 0));
8117 
8118   shlq(tmp2, 8);
8119   xorq(tmp1, tmp2);
8120 
8121   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
8122   movl(tmp2, in);
8123   shrl(tmp2, 16);
8124   andl(tmp2, 0x000000FF);
8125   shll(tmp2, 3);
8126   addq(tmp2, tmp3);
8127   movq(tmp2, Address(tmp2, 0));
8128 
8129   shlq(tmp2, 16);
8130   xorq(tmp1, tmp2);
8131 
8132   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
8133   shrl(in, 24);
8134   andl(in, 0x000000FF);
8135   shll(in, 3);
8136   addq(in, tmp3);
8137   movq(in, Address(in, 0));
8138 
8139   shlq(in, 24);
8140   xorq(in, tmp1);
8141   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
8142 }
8143 
8144 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
8145                                       Register in_out,
8146                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
8147                                       XMMRegister w_xtmp2,
8148                                       Register tmp1,
8149                                       Register n_tmp2, Register n_tmp3) {
8150   if (is_pclmulqdq_supported) {
8151     movdl(w_xtmp1, in_out); // modified blindly
8152 
8153     movl(tmp1, const_or_pre_comp_const_index);
8154     movdl(w_xtmp2, tmp1);
8155     pclmulqdq(w_xtmp1, w_xtmp2, 0);
8156 
8157     movdq(in_out, w_xtmp1);
8158   } else {
8159     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
8160   }
8161 }
8162 
8163 // Recombination Alternative 2: No bit-reflections
8164 // T1 = (CRC_A * U1) << 1
8165 // T2 = (CRC_B * U2) << 1
8166 // C1 = T1 >> 32
8167 // C2 = T2 >> 32
8168 // T1 = T1 & 0xFFFFFFFF
8169 // T2 = T2 & 0xFFFFFFFF
8170 // T1 = CRC32(0, T1)
8171 // T2 = CRC32(0, T2)
8172 // C1 = C1 ^ T1
8173 // C2 = C2 ^ T2
8174 // CRC = C1 ^ C2 ^ CRC_C
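     // A rough C sketch of this recombination (illustrative only; clmul32 stands for the
     // 32x32->64 carry-less multiply done with PCLMULQDQ, and U1/U2 are the precomputed
     // constants passed in via const_or_pre_comp_const_index):
     //
     //   uint64_t t1 = clmul32(crcA, U1) << 1;
     //   uint64_t t2 = clmul32(crcB, U2) << 1;
     //   uint32_t c1 = (uint32_t)(t1 >> 32) ^ _mm_crc32_u32(0, (uint32_t)t1);
     //   uint32_t c2 = (uint32_t)(t2 >> 32) ^ _mm_crc32_u32(0, (uint32_t)t2);
     //   uint32_t crc = c1 ^ c2 ^ crcC;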
8175 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
8176                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8177                                      Register tmp1, Register tmp2,
8178                                      Register n_tmp3) {
8179   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8180   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8181   shlq(in_out, 1);
8182   movl(tmp1, in_out);
8183   shrq(in_out, 32);
8184   xorl(tmp2, tmp2);
8185   crc32(tmp2, tmp1, 4);
8186   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
8187   shlq(in1, 1);
8188   movl(tmp1, in1);
8189   shrq(in1, 32);
8190   xorl(tmp2, tmp2);
8191   crc32(tmp2, tmp1, 4);
8192   xorl(in1, tmp2);
8193   xorl(in_out, in1);
8194   xorl(in_out, in2);
8195 }
8196 
8197 // Set N to a predefined value
8198 // Subtract it from the length of the buffer
8199 // Execute in a loop:
8200 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
8201 // for i = 1 to N do
8202 //  CRC_A = CRC32(CRC_A, A[i])
8203 //  CRC_B = CRC32(CRC_B, B[i])
8204 //  CRC_C = CRC32(CRC_C, C[i])
8205 // end for
8206 // Recombine
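     // An illustrative C outline of one chunk pass (not the generated code; A, B and C
     // are pointers to the three size-byte partitions, crc is the running CRC value, and
     // recombine corresponds to crc32c_rec_alt2 above):
     //
     //   uint64_t crcA = crc, crcB = 0, crcC = 0;
     //   for (uint32_t i = 0; i < size / 8; i++) {
     //     crcA = _mm_crc32_u64(crcA, A[i]);
     //     crcB = _mm_crc32_u64(crcB, B[i]);
     //     crcC = _mm_crc32_u64(crcC, C[i]);
     //   }
     //   crc = recombine((uint32_t)crcA, (uint32_t)crcB, (uint32_t)crcC);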
8207 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
8208                                        Register in_out1, Register in_out2, Register in_out3,
8209                                        Register tmp1, Register tmp2, Register tmp3,
8210                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8211                                        Register tmp4, Register tmp5,
8212                                        Register n_tmp6) {
8213   Label L_processPartitions;
8214   Label L_processPartition;
8215   Label L_exit;
8216 
8217   bind(L_processPartitions);
8218   cmpl(in_out1, 3 * size);
8219   jcc(Assembler::less, L_exit);
8220     xorl(tmp1, tmp1);
8221     xorl(tmp2, tmp2);
8222     movq(tmp3, in_out2);
8223     addq(tmp3, size);
8224 
8225     bind(L_processPartition);
8226       crc32(in_out3, Address(in_out2, 0), 8);
8227       crc32(tmp1, Address(in_out2, size), 8);
8228       crc32(tmp2, Address(in_out2, size * 2), 8);
8229       addq(in_out2, 8);
8230       cmpq(in_out2, tmp3);
8231       jcc(Assembler::less, L_processPartition);
8232     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
8233             w_xtmp1, w_xtmp2, w_xtmp3,
8234             tmp4, tmp5,
8235             n_tmp6);
8236     addq(in_out2, 2 * size);
8237     subl(in_out1, 3 * size);
8238     jmp(L_processPartitions);
8239 
8240   bind(L_exit);
8241 }
8242 #else
8243 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
8244                                      Register tmp1, Register tmp2, Register tmp3,
8245                                      XMMRegister xtmp1, XMMRegister xtmp2) {
8246   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
8247   if (n > 0) {
8248     addl(tmp3, n * 256 * 8);
8249   }
8250   //    Q1 = TABLEExt[n][B & 0xFF];
8251   movl(tmp1, in_out);
8252   andl(tmp1, 0x000000FF);
8253   shll(tmp1, 3);
8254   addl(tmp1, tmp3);
8255   movq(xtmp1, Address(tmp1, 0));
8256 
8257   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
8258   movl(tmp2, in_out);
8259   shrl(tmp2, 8);
8260   andl(tmp2, 0x000000FF);
8261   shll(tmp2, 3);
8262   addl(tmp2, tmp3);
8263   movq(xtmp2, Address(tmp2, 0));
8264 
8265   psllq(xtmp2, 8);
8266   pxor(xtmp1, xtmp2);
8267 
8268   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
8269   movl(tmp2, in_out);
8270   shrl(tmp2, 16);
8271   andl(tmp2, 0x000000FF);
8272   shll(tmp2, 3);
8273   addl(tmp2, tmp3);
8274   movq(xtmp2, Address(tmp2, 0));
8275 
8276   psllq(xtmp2, 16);
8277   pxor(xtmp1, xtmp2);
8278 
8279   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
8280   shrl(in_out, 24);
8281   andl(in_out, 0x000000FF);
8282   shll(in_out, 3);
8283   addl(in_out, tmp3);
8284   movq(xtmp2, Address(in_out, 0));
8285 
8286   psllq(xtmp2, 24);
8287   pxor(xtmp1, xtmp2); // Result in CXMM
8288   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
8289 }
8290 
8291 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
8292                                       Register in_out,
8293                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
8294                                       XMMRegister w_xtmp2,
8295                                       Register tmp1,
8296                                       Register n_tmp2, Register n_tmp3) {
8297   if (is_pclmulqdq_supported) {
8298     movdl(w_xtmp1, in_out);
8299 
8300     movl(tmp1, const_or_pre_comp_const_index);
8301     movdl(w_xtmp2, tmp1);
8302     pclmulqdq(w_xtmp1, w_xtmp2, 0);
8303     // Keep result in XMM since GPR is 32 bit in length
8304   } else {
8305     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
8306   }
8307 }
8308 
8309 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
8310                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8311                                      Register tmp1, Register tmp2,
8312                                      Register n_tmp3) {
8313   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8314   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8315 
8316   psllq(w_xtmp1, 1);
8317   movdl(tmp1, w_xtmp1);
8318   psrlq(w_xtmp1, 32);
8319   movdl(in_out, w_xtmp1);
8320 
8321   xorl(tmp2, tmp2);
8322   crc32(tmp2, tmp1, 4);
8323   xorl(in_out, tmp2);
8324 
8325   psllq(w_xtmp2, 1);
8326   movdl(tmp1, w_xtmp2);
8327   psrlq(w_xtmp2, 32);
8328   movdl(in1, w_xtmp2);
8329 
8330   xorl(tmp2, tmp2);
8331   crc32(tmp2, tmp1, 4);
8332   xorl(in1, tmp2);
8333   xorl(in_out, in1);
8334   xorl(in_out, in2);
8335 }
8336 
8337 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
8338                                        Register in_out1, Register in_out2, Register in_out3,
8339                                        Register tmp1, Register tmp2, Register tmp3,
8340                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8341                                        Register tmp4, Register tmp5,
8342                                        Register n_tmp6) {
8343   Label L_processPartitions;
8344   Label L_processPartition;
8345   Label L_exit;
8346 
8347   bind(L_processPartitions);
8348   cmpl(in_out1, 3 * size);
8349   jcc(Assembler::less, L_exit);
8350     xorl(tmp1, tmp1);
8351     xorl(tmp2, tmp2);
8352     movl(tmp3, in_out2);
8353     addl(tmp3, size);
8354 
8355     bind(L_processPartition);
8356       crc32(in_out3, Address(in_out2, 0), 4);
8357       crc32(tmp1, Address(in_out2, size), 4);
8358       crc32(tmp2, Address(in_out2, size*2), 4);
8359       crc32(in_out3, Address(in_out2, 0+4), 4);
8360       crc32(tmp1, Address(in_out2, size+4), 4);
8361       crc32(tmp2, Address(in_out2, size*2+4), 4);
8362       addl(in_out2, 8);
8363       cmpl(in_out2, tmp3);
8364       jcc(Assembler::less, L_processPartition);
8365 
8366         push(tmp3);
8367         push(in_out1);
8368         push(in_out2);
8369         tmp4 = tmp3;
8370         tmp5 = in_out1;
8371         n_tmp6 = in_out2;
8372 
8373       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
8374             w_xtmp1, w_xtmp2, w_xtmp3,
8375             tmp4, tmp5,
8376             n_tmp6);
8377 
8378         pop(in_out2);
8379         pop(in_out1);
8380         pop(tmp3);
8381 
8382     addl(in_out2, 2 * size);
8383     subl(in_out1, 3 * size);
8384     jmp(L_processPartitions);
8385 
8386   bind(L_exit);
8387 }
8388 #endif //LP64
8389 
8390 #ifdef _LP64
8391 // Algorithm 2: Pipelined usage of the CRC32 instruction.
8392 // Input: A buffer I of L bytes.
8393 // Output: the CRC32C value of the buffer.
8394 // Notations:
8395 // Write L = 24N + r, with N = floor (L/24).
8396 // r = L mod 24 (0 <= r < 24).
8397 // Consider I as the concatenation A|B|C|R, where A, B and C each consist of
8398 // N quadwords, and R consists of r bytes.
8399 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
8400 // B[j] = I [8N + 8j+7:8N + 8j], j= 0, 1, ..., N-1
8401 // C[j] = I [16N + 8j+7:16N + 8j], j= 0, 1, ..., N-1
8402 // if r > 0 R[j] = I [24N + j], j= 0, 1, ..., r-1
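     // For example, L = 100 gives N = floor(100/24) = 4 and r = 4, so A, B and C are the
     // three 32-byte partitions at byte offsets 0, 32 and 64, and R is the final 4 bytes.
     // The three partitions are fed to the CRC32 instruction in parallel and the partial
     // results are recombined with carry-less multiplication (see crc32c_proc_chunk and
     // crc32c_rec_alt2 above).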
8403 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
8404                                           Register tmp1, Register tmp2, Register tmp3,
8405                                           Register tmp4, Register tmp5, Register tmp6,
8406                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8407                                           bool is_pclmulqdq_supported) {
8408   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
8409   Label L_wordByWord;
8410   Label L_byteByByteProlog;
8411   Label L_byteByByte;
8412   Label L_exit;
8413 
8414   if (is_pclmulqdq_supported ) {
8415     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
8416     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
8417 
8418     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
8419     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
8420 
8421     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8422     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8423     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
8424   } else {
8425     const_or_pre_comp_const_index[0] = 1;
8426     const_or_pre_comp_const_index[1] = 0;
8427 
8428     const_or_pre_comp_const_index[2] = 3;
8429     const_or_pre_comp_const_index[3] = 2;
8430 
8431     const_or_pre_comp_const_index[4] = 5;
8432     const_or_pre_comp_const_index[5] = 4;
8433    }
8434   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8435                     in2, in1, in_out,
8436                     tmp1, tmp2, tmp3,
8437                     w_xtmp1, w_xtmp2, w_xtmp3,
8438                     tmp4, tmp5,
8439                     tmp6);
8440   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8441                     in2, in1, in_out,
8442                     tmp1, tmp2, tmp3,
8443                     w_xtmp1, w_xtmp2, w_xtmp3,
8444                     tmp4, tmp5,
8445                     tmp6);
8446   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8447                     in2, in1, in_out,
8448                     tmp1, tmp2, tmp3,
8449                     w_xtmp1, w_xtmp2, w_xtmp3,
8450                     tmp4, tmp5,
8451                     tmp6);
8452   movl(tmp1, in2);
8453   andl(tmp1, 0x00000007);
8454   negl(tmp1);
8455   addl(tmp1, in2);
8456   addq(tmp1, in1);
8457 
8458   cmpq(in1, tmp1);
8459   jccb(Assembler::greaterEqual, L_byteByByteProlog);
8460   align(16);
8461   BIND(L_wordByWord);
8462     crc32(in_out, Address(in1, 0), 8);
8463     addq(in1, 8);
8464     cmpq(in1, tmp1);
8465     jcc(Assembler::less, L_wordByWord);
8466 
8467   BIND(L_byteByByteProlog);
8468   andl(in2, 0x00000007);
8469   movl(tmp2, 1);
8470 
8471   cmpl(tmp2, in2);
8472   jccb(Assembler::greater, L_exit);
8473   BIND(L_byteByByte);
8474     crc32(in_out, Address(in1, 0), 1);
8475     incq(in1);
8476     incl(tmp2);
8477     cmpl(tmp2, in2);
8478     jcc(Assembler::lessEqual, L_byteByByte);
8479 
8480   BIND(L_exit);
8481 }
8482 #else
8483 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
8484                                           Register tmp1, Register  tmp2, Register tmp3,
8485                                           Register tmp4, Register  tmp5, Register tmp6,
8486                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8487                                           bool is_pclmulqdq_supported) {
8488   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
8489   Label L_wordByWord;
8490   Label L_byteByByteProlog;
8491   Label L_byteByByte;
8492   Label L_exit;
8493 
8494   if (is_pclmulqdq_supported) {
8495     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
8496     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
8497 
8498     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
8499     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
8500 
8501     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8502     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8503   } else {
8504     const_or_pre_comp_const_index[0] = 1;
8505     const_or_pre_comp_const_index[1] = 0;
8506 
8507     const_or_pre_comp_const_index[2] = 3;
8508     const_or_pre_comp_const_index[3] = 2;
8509 
8510     const_or_pre_comp_const_index[4] = 5;
8511     const_or_pre_comp_const_index[5] = 4;
8512   }
8513   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8514                     in2, in1, in_out,
8515                     tmp1, tmp2, tmp3,
8516                     w_xtmp1, w_xtmp2, w_xtmp3,
8517                     tmp4, tmp5,
8518                     tmp6);
8519   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8520                     in2, in1, in_out,
8521                     tmp1, tmp2, tmp3,
8522                     w_xtmp1, w_xtmp2, w_xtmp3,
8523                     tmp4, tmp5,
8524                     tmp6);
8525   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8526                     in2, in1, in_out,
8527                     tmp1, tmp2, tmp3,
8528                     w_xtmp1, w_xtmp2, w_xtmp3,
8529                     tmp4, tmp5,
8530                     tmp6);
8531   movl(tmp1, in2);
8532   andl(tmp1, 0x00000007);
8533   negl(tmp1);
8534   addl(tmp1, in2);
8535   addl(tmp1, in1);
8536 
8537   BIND(L_wordByWord);
8538   cmpl(in1, tmp1);
8539   jcc(Assembler::greaterEqual, L_byteByByteProlog);
8540     crc32(in_out, Address(in1,0), 4);
8541     addl(in1, 4);
8542     jmp(L_wordByWord);
8543 
8544   BIND(L_byteByByteProlog);
8545   andl(in2, 0x00000007);
8546   movl(tmp2, 1);
8547 
8548   BIND(L_byteByByte);
8549   cmpl(tmp2, in2);
8550   jccb(Assembler::greater, L_exit);
8551     movb(tmp1, Address(in1, 0));
8552     crc32(in_out, tmp1, 1);
8553     incl(in1);
8554     incl(tmp2);
8555     jmp(L_byteByByte);
8556 
8557   BIND(L_exit);
8558 }
8559 #endif // LP64
8560 #undef BIND
8561 #undef BLOCK_COMMENT
8562 
8563 // Compress char[] array to byte[].
8564 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
8565 //   @IntrinsicCandidate
8566 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8567 //     for (int i = 0; i < len; i++) {
8568 //       int c = src[srcOff++];
8569 //       if (c >>> 8 != 0) {
8570 //         return 0;
8571 //       }
8572 //       dst[dstOff++] = (byte)c;
8573 //     }
8574 //     return len;
8575 //   }
8576 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
8577   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8578   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8579   Register tmp5, Register result, KRegister mask1, KRegister mask2) {
8580   Label copy_chars_loop, return_length, return_zero, done;
8581 
8582   // rsi: src
8583   // rdi: dst
8584   // rdx: len
8585   // rcx: tmp5
8586   // rax: result
8587 
8588   // rsi holds start addr of source char[] to be compressed
8589   // rdi holds start addr of destination byte[]
8590   // rdx holds length
8591 
8592   assert(len != result, "");
8593 
8594   // save length for return
8595   push(len);
8596 
8597   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
8598     VM_Version::supports_avx512vlbw() &&
8599     VM_Version::supports_bmi2()) {
8600 
8601     Label copy_32_loop, copy_loop_tail, below_threshold;
8602 
8603     // alignment
8604     Label post_alignment;
8605 
8606     // if the length of the string is less than 32, handle it the old-fashioned way
8607     testl(len, -32);
8608     jcc(Assembler::zero, below_threshold);
8609 
8610     // First check whether a character is compressible ( <= 0xFF).
8611     // Create mask to test for Unicode chars inside zmm vector
8612     movl(result, 0x00FF);
8613     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
8614 
8615     testl(len, -64);
8616     jcc(Assembler::zero, post_alignment);
8617 
8618     movl(tmp5, dst);
8619     andl(tmp5, (32 - 1));
8620     negl(tmp5);
8621     andl(tmp5, (32 - 1));
8622 
8623     // bail out when there is nothing to be done
8624     testl(tmp5, 0xFFFFFFFF);
8625     jcc(Assembler::zero, post_alignment);
8626 
8627     // ~(~0 << len), where len is the # of remaining elements to process
8628     movl(result, 0xFFFFFFFF);
8629     shlxl(result, result, tmp5);
8630     notl(result);
8631     kmovdl(mask2, result);
8632 
8633     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8634     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8635     ktestd(mask1, mask2);
8636     jcc(Assembler::carryClear, return_zero);
8637 
8638     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8639 
8640     addptr(src, tmp5);
8641     addptr(src, tmp5);
8642     addptr(dst, tmp5);
8643     subl(len, tmp5);
8644 
8645     bind(post_alignment);
8646     // end of alignment
8647 
8648     movl(tmp5, len);
8649     andl(tmp5, (32 - 1));    // tail count (in chars)
8650     andl(len, ~(32 - 1));    // vector count (in chars)
8651     jcc(Assembler::zero, copy_loop_tail);
8652 
8653     lea(src, Address(src, len, Address::times_2));
8654     lea(dst, Address(dst, len, Address::times_1));
8655     negptr(len);
8656 
8657     bind(copy_32_loop);
8658     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
8659     evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
8660     kortestdl(mask1, mask1);
8661     jcc(Assembler::carryClear, return_zero);
8662 
8663     // All elements in the current processed chunk are valid candidates for
8664     // compression. Write the truncated byte elements to memory.
8665     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
8666     addptr(len, 32);
8667     jcc(Assembler::notZero, copy_32_loop);
8668 
8669     bind(copy_loop_tail);
8670     // bail out when there is nothing to be done
8671     testl(tmp5, 0xFFFFFFFF);
8672     jcc(Assembler::zero, return_length);
8673 
8674     movl(len, tmp5);
8675 
8676     // ~(~0 << len), where len is the # of remaining elements to process
8677     movl(result, 0xFFFFFFFF);
8678     shlxl(result, result, len);
8679     notl(result);
8680 
8681     kmovdl(mask2, result);
8682 
8683     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8684     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8685     ktestd(mask1, mask2);
8686     jcc(Assembler::carryClear, return_zero);
8687 
8688     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8689     jmp(return_length);
8690 
8691     bind(below_threshold);
8692   }
8693 
8694   if (UseSSE42Intrinsics) {
8695     Label copy_32_loop, copy_16, copy_tail;
8696 
8697     movl(result, len);
8698 
8699     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
8700 
8701     // vectored compression
8702     andl(len, 0xfffffff0);    // vector count (in chars)
8703     andl(result, 0x0000000f);    // tail count (in chars)
8704     testl(len, len);
8705     jcc(Assembler::zero, copy_16);
8706 
8707     // compress 16 chars per iter
8708     movdl(tmp1Reg, tmp5);
8709     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8710     pxor(tmp4Reg, tmp4Reg);
8711 
8712     lea(src, Address(src, len, Address::times_2));
8713     lea(dst, Address(dst, len, Address::times_1));
8714     negptr(len);
8715 
8716     bind(copy_32_loop);
8717     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
8718     por(tmp4Reg, tmp2Reg);
8719     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
8720     por(tmp4Reg, tmp3Reg);
8721     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
8722     jcc(Assembler::notZero, return_zero);
8723     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8724     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
8725     addptr(len, 16);
8726     jcc(Assembler::notZero, copy_32_loop);
8727 
8728     // compress next vector of 8 chars (if any)
8729     bind(copy_16);
8730     movl(len, result);
8731     andl(len, 0xfffffff8);    // vector count (in chars)
8732     andl(result, 0x00000007);    // tail count (in chars)
8733     testl(len, len);
8734     jccb(Assembler::zero, copy_tail);
8735 
8736     movdl(tmp1Reg, tmp5);
8737     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8738     pxor(tmp3Reg, tmp3Reg);
8739 
8740     movdqu(tmp2Reg, Address(src, 0));
8741     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
8742     jccb(Assembler::notZero, return_zero);
8743     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8744     movq(Address(dst, 0), tmp2Reg);
8745     addptr(src, 16);
8746     addptr(dst, 8);
8747 
8748     bind(copy_tail);
8749     movl(len, result);
8750   }
8751   // compress 1 char per iter
8752   testl(len, len);
8753   jccb(Assembler::zero, return_length);
8754   lea(src, Address(src, len, Address::times_2));
8755   lea(dst, Address(dst, len, Address::times_1));
8756   negptr(len);
8757 
8758   bind(copy_chars_loop);
8759   load_unsigned_short(result, Address(src, len, Address::times_2));
8760   testl(result, 0xff00);      // check if Unicode char
8761   jccb(Assembler::notZero, return_zero);
8762   movb(Address(dst, len, Address::times_1), result);  // LATIN1 char; compress to 1 byte
8763   increment(len);
8764   jcc(Assembler::notZero, copy_chars_loop);
8765 
8766   // if compression succeeded, return length
8767   bind(return_length);
8768   pop(result);
8769   jmpb(done);
8770 
8771   // if compression failed, return 0
8772   bind(return_zero);
8773   xorl(result, result);
8774   addptr(rsp, wordSize);
8775 
8776   bind(done);
8777 }
8778 
8779 // Inflate byte[] array to char[].
8780 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
8781 //   @IntrinsicCandidate
8782 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
8783 //     for (int i = 0; i < len; i++) {
8784 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
8785 //     }
8786 //   }
8787 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
8788   XMMRegister tmp1, Register tmp2, KRegister mask) {
8789   Label copy_chars_loop, done, below_threshold, avx3_threshold;
8790   // rsi: src
8791   // rdi: dst
8792   // rdx: len
8793   // rcx: tmp2
8794 
8795   // rsi holds start addr of source byte[] to be inflated
8796   // rdi holds start addr of destination char[]
8797   // rdx holds length
8798   assert_different_registers(src, dst, len, tmp2);
8799   movl(tmp2, len);
8800   if ((UseAVX > 2) && // AVX512
8801     VM_Version::supports_avx512vlbw() &&
8802     VM_Version::supports_bmi2()) {
8803 
8804     Label copy_32_loop, copy_tail;
8805     Register tmp3_aliased = len;
8806 
8807     // if the length of the string is less than 16, handle it the old-fashioned way
8808     testl(len, -16);
8809     jcc(Assembler::zero, below_threshold);
8810 
8811     testl(len, -1 * AVX3Threshold);
8812     jcc(Assembler::zero, avx3_threshold);
8813 
8814     // In order to use only one arithmetic operation in the main loop, we do
8815     // this pre-calculation
8816     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
8817     andl(len, -32);     // vector count
8818     jccb(Assembler::zero, copy_tail);
8819 
8820     lea(src, Address(src, len, Address::times_1));
8821     lea(dst, Address(dst, len, Address::times_2));
8822     negptr(len);
8823 
8824 
8825     // inflate 32 chars per iter
8826     bind(copy_32_loop);
8827     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
8828     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
8829     addptr(len, 32);
8830     jcc(Assembler::notZero, copy_32_loop);
8831 
8832     bind(copy_tail);
8833     // bail out when there is nothing to be done
8834     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
8835     jcc(Assembler::zero, done);
8836 
8837     // ~(~0 << length), where length is the # of remaining elements to process
8838     movl(tmp3_aliased, -1);
8839     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
8840     notl(tmp3_aliased);
8841     kmovdl(mask, tmp3_aliased);
8842     evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
8843     evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
8844 
8845     jmp(done);
8846     bind(avx3_threshold);
8847   }
8848   if (UseSSE42Intrinsics) {
8849     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
8850 
8851     if (UseAVX > 1) {
8852       andl(tmp2, (16 - 1));
8853       andl(len, -16);
8854       jccb(Assembler::zero, copy_new_tail);
8855     } else {
8856       andl(tmp2, 0x00000007);   // tail count (in chars)
8857       andl(len, 0xfffffff8);    // vector count (in chars)
8858       jccb(Assembler::zero, copy_tail);
8859     }
8860 
8861     // vectored inflation
8862     lea(src, Address(src, len, Address::times_1));
8863     lea(dst, Address(dst, len, Address::times_2));
8864     negptr(len);
8865 
8866     if (UseAVX > 1) {
8867       bind(copy_16_loop);
8868       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
8869       vmovdqu(Address(dst, len, Address::times_2), tmp1);
8870       addptr(len, 16);
8871       jcc(Assembler::notZero, copy_16_loop);
8872 
8873       bind(below_threshold);
8874       bind(copy_new_tail);
8875       movl(len, tmp2);
8876       andl(tmp2, 0x00000007);
8877       andl(len, 0xFFFFFFF8);
8878       jccb(Assembler::zero, copy_tail);
8879 
8880       pmovzxbw(tmp1, Address(src, 0));
8881       movdqu(Address(dst, 0), tmp1);
8882       addptr(src, 8);
8883       addptr(dst, 2 * 8);
8884 
8885       jmp(copy_tail, true);
8886     }
8887 
8888     // inflate 8 chars per iter
8889     bind(copy_8_loop);
8890     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
8891     movdqu(Address(dst, len, Address::times_2), tmp1);
8892     addptr(len, 8);
8893     jcc(Assembler::notZero, copy_8_loop);
8894 
8895     bind(copy_tail);
8896     movl(len, tmp2);
8897 
8898     cmpl(len, 4);
8899     jccb(Assembler::less, copy_bytes);
8900 
8901     movdl(tmp1, Address(src, 0));  // load 4 byte chars
8902     pmovzxbw(tmp1, tmp1);
8903     movq(Address(dst, 0), tmp1);
8904     subptr(len, 4);
8905     addptr(src, 4);
8906     addptr(dst, 8);
8907 
8908     bind(copy_bytes);
8909   } else {
8910     bind(below_threshold);
8911   }
8912 
8913   testl(len, len);
8914   jccb(Assembler::zero, done);
8915   lea(src, Address(src, len, Address::times_1));
8916   lea(dst, Address(dst, len, Address::times_2));
8917   negptr(len);
8918 
8919   // inflate 1 char per iter
8920   bind(copy_chars_loop);
8921   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
8922   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
8923   increment(len);
8924   jcc(Assembler::notZero, copy_chars_loop);
8925 
8926   bind(done);
8927 }
8928 
8929 
8930 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
8931   switch(type) {
8932     case T_BYTE:
8933     case T_BOOLEAN:
8934       evmovdqub(dst, kmask, src, merge, vector_len);
8935       break;
8936     case T_CHAR:
8937     case T_SHORT:
8938       evmovdquw(dst, kmask, src, merge, vector_len);
8939       break;
8940     case T_INT:
8941     case T_FLOAT:
8942       evmovdqul(dst, kmask, src, merge, vector_len);
8943       break;
8944     case T_LONG:
8945     case T_DOUBLE:
8946       evmovdquq(dst, kmask, src, merge, vector_len);
8947       break;
8948     default:
8949       fatal("Unexpected type argument %s", type2name(type));
8950       break;
8951   }
8952 }
8953 
8954 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
8955   switch(type) {
8956     case T_BYTE:
8957     case T_BOOLEAN:
8958       evmovdqub(dst, kmask, src, merge, vector_len);
8959       break;
8960     case T_CHAR:
8961     case T_SHORT:
8962       evmovdquw(dst, kmask, src, merge, vector_len);
8963       break;
8964     case T_INT:
8965     case T_FLOAT:
8966       evmovdqul(dst, kmask, src, merge, vector_len);
8967       break;
8968     case T_LONG:
8969     case T_DOUBLE:
8970       evmovdquq(dst, kmask, src, merge, vector_len);
8971       break;
8972     default:
8973       fatal("Unexpected type argument %s", type2name(type));
8974       break;
8975   }
8976 }
8977 
8978 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
8979   switch(masklen) {
8980     case 2:
8981        knotbl(dst, src);
8982        movl(rtmp, 3);
8983        kmovbl(ktmp, rtmp);
8984        kandbl(dst, ktmp, dst);
8985        break;
8986     case 4:
8987        knotbl(dst, src);
8988        movl(rtmp, 15);
8989        kmovbl(ktmp, rtmp);
8990        kandbl(dst, ktmp, dst);
8991        break;
8992     case 8:
8993        knotbl(dst, src);
8994        break;
8995     case 16:
8996        knotwl(dst, src);
8997        break;
8998     case 32:
8999        knotdl(dst, src);
9000        break;
9001     case 64:
9002        knotql(dst, src);
9003        break;
9004     default:
9005       fatal("Unexpected vector length %d", masklen);
9006       break;
9007   }
9008 }
9009 
9010 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9011   switch(type) {
9012     case T_BOOLEAN:
9013     case T_BYTE:
9014        kandbl(dst, src1, src2);
9015        break;
9016     case T_CHAR:
9017     case T_SHORT:
9018        kandwl(dst, src1, src2);
9019        break;
9020     case T_INT:
9021     case T_FLOAT:
9022        kanddl(dst, src1, src2);
9023        break;
9024     case T_LONG:
9025     case T_DOUBLE:
9026        kandql(dst, src1, src2);
9027        break;
9028     default:
9029       fatal("Unexpected type argument %s", type2name(type));
9030       break;
9031   }
9032 }
9033 
9034 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9035   switch(type) {
9036     case T_BOOLEAN:
9037     case T_BYTE:
9038        korbl(dst, src1, src2);
9039        break;
9040     case T_CHAR:
9041     case T_SHORT:
9042        korwl(dst, src1, src2);
9043        break;
9044     case T_INT:
9045     case T_FLOAT:
9046        kordl(dst, src1, src2);
9047        break;
9048     case T_LONG:
9049     case T_DOUBLE:
9050        korql(dst, src1, src2);
9051        break;
9052     default:
9053       fatal("Unexpected type argument %s", type2name(type));
9054       break;
9055   }
9056 }
9057 
9058 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9059   switch(type) {
9060     case T_BOOLEAN:
9061     case T_BYTE:
9062        kxorbl(dst, src1, src2);
9063        break;
9064     case T_CHAR:
9065     case T_SHORT:
9066        kxorwl(dst, src1, src2);
9067        break;
9068     case T_INT:
9069     case T_FLOAT:
9070        kxordl(dst, src1, src2);
9071        break;
9072     case T_LONG:
9073     case T_DOUBLE:
9074        kxorql(dst, src1, src2);
9075        break;
9076     default:
9077       fatal("Unexpected type argument %s", type2name(type));
9078       break;
9079   }
9080 }
9081 
9082 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9083   switch(type) {
9084     case T_BOOLEAN:
9085     case T_BYTE:
9086       evpermb(dst, mask, nds, src, merge, vector_len); break;
9087     case T_CHAR:
9088     case T_SHORT:
9089       evpermw(dst, mask, nds, src, merge, vector_len); break;
9090     case T_INT:
9091     case T_FLOAT:
9092       evpermd(dst, mask, nds, src, merge, vector_len); break;
9093     case T_LONG:
9094     case T_DOUBLE:
9095       evpermq(dst, mask, nds, src, merge, vector_len); break;
9096     default:
9097       fatal("Unexpected type argument %s", type2name(type)); break;
9098   }
9099 }
9100 
9101 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9102   switch(type) {
9103     case T_BOOLEAN:
9104     case T_BYTE:
9105       evpermb(dst, mask, nds, src, merge, vector_len); break;
9106     case T_CHAR:
9107     case T_SHORT:
9108       evpermw(dst, mask, nds, src, merge, vector_len); break;
9109     case T_INT:
9110     case T_FLOAT:
9111       evpermd(dst, mask, nds, src, merge, vector_len); break;
9112     case T_LONG:
9113     case T_DOUBLE:
9114       evpermq(dst, mask, nds, src, merge, vector_len); break;
9115     default:
9116       fatal("Unexpected type argument %s", type2name(type)); break;
9117   }
9118 }
9119 
9120 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9121   switch(type) {
9122     case T_BYTE:
9123       evpminsb(dst, mask, nds, src, merge, vector_len); break;
9124     case T_SHORT:
9125       evpminsw(dst, mask, nds, src, merge, vector_len); break;
9126     case T_INT:
9127       evpminsd(dst, mask, nds, src, merge, vector_len); break;
9128     case T_LONG:
9129       evpminsq(dst, mask, nds, src, merge, vector_len); break;
9130     default:
9131       fatal("Unexpected type argument %s", type2name(type)); break;
9132   }
9133 }
9134 
9135 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9136   switch(type) {
9137     case T_BYTE:
9138       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9139     case T_SHORT:
9140       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9141     case T_INT:
9142       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9143     case T_LONG:
9144       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9145     default:
9146       fatal("Unexpected type argument %s", type2name(type)); break;
9147   }
9148 }
9149 
9150 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9151   switch(type) {
9152     case T_BYTE:
9153       evpminsb(dst, mask, nds, src, merge, vector_len); break;
9154     case T_SHORT:
9155       evpminsw(dst, mask, nds, src, merge, vector_len); break;
9156     case T_INT:
9157       evpminsd(dst, mask, nds, src, merge, vector_len); break;
9158     case T_LONG:
9159       evpminsq(dst, mask, nds, src, merge, vector_len); break;
9160     default:
9161       fatal("Unexpected type argument %s", type2name(type)); break;
9162   }
9163 }
9164 
9165 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9166   switch(type) {
9167     case T_BYTE:
9168       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9169     case T_SHORT:
9170       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9171     case T_INT:
9172       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9173     case T_LONG:
9174       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9175     default:
9176       fatal("Unexpected type argument %s", type2name(type)); break;
9177   }
9178 }
9179 
9180 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9181   switch(type) {
9182     case T_INT:
9183       evpxord(dst, mask, nds, src, merge, vector_len); break;
9184     case T_LONG:
9185       evpxorq(dst, mask, nds, src, merge, vector_len); break;
9186     default:
9187       fatal("Unexpected type argument %s", type2name(type)); break;
9188   }
9189 }
9190 
9191 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9192   switch(type) {
9193     case T_INT:
9194       evpxord(dst, mask, nds, src, merge, vector_len); break;
9195     case T_LONG:
9196       evpxorq(dst, mask, nds, src, merge, vector_len); break;
9197     default:
9198       fatal("Unexpected type argument %s", type2name(type)); break;
9199   }
9200 }
9201 
9202 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9203   switch(type) {
9204     case T_INT:
9205       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9206     case T_LONG:
9207       evporq(dst, mask, nds, src, merge, vector_len); break;
9208     default:
9209       fatal("Unexpected type argument %s", type2name(type)); break;
9210   }
9211 }
9212 
9213 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9214   switch(type) {
9215     case T_INT:
9216       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9217     case T_LONG:
9218       evporq(dst, mask, nds, src, merge, vector_len); break;
9219     default:
9220       fatal("Unexpected type argument %s", type2name(type)); break;
9221   }
9222 }
9223 
9224 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9225   switch(type) {
9226     case T_INT:
9227       evpandd(dst, mask, nds, src, merge, vector_len); break;
9228     case T_LONG:
9229       evpandq(dst, mask, nds, src, merge, vector_len); break;
9230     default:
9231       fatal("Unexpected type argument %s", type2name(type)); break;
9232   }
9233 }
9234 
9235 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9236   switch(type) {
9237     case T_INT:
9238       evpandd(dst, mask, nds, src, merge, vector_len); break;
9239     case T_LONG:
9240       evpandq(dst, mask, nds, src, merge, vector_len); break;
9241     default:
9242       fatal("Unexpected type argument %s", type2name(type)); break;
9243   }
9244 }
9245 
9246 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
9247   switch(masklen) {
9248     case 8:
9249        kortestbl(src1, src2);
9250        break;
9251     case 16:
9252        kortestwl(src1, src2);
9253        break;
9254     case 32:
9255        kortestdl(src1, src2);
9256        break;
9257     case 64:
9258        kortestql(src1, src2);
9259        break;
9260     default:
9261       fatal("Unexpected mask length %d", masklen);
9262       break;
9263   }
9264 }
9265 
9266 
9267 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
9268   switch(masklen)  {
9269     case 8:
9270        ktestbl(src1, src2);
9271        break;
9272     case 16:
9273        ktestwl(src1, src2);
9274        break;
9275     case 32:
9276        ktestdl(src1, src2);
9277        break;
9278     case 64:
9279        ktestql(src1, src2);
9280        break;
9281     default:
9282       fatal("Unexpected mask length %d", masklen);
9283       break;
9284   }
9285 }
9286 
9287 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9288   switch(type) {
9289     case T_INT:
9290       evprold(dst, mask, src, shift, merge, vlen_enc); break;
9291     case T_LONG:
9292       evprolq(dst, mask, src, shift, merge, vlen_enc); break;
9293     default:
9294       fatal("Unexpected type argument %s", type2name(type)); break;
9296   }
9297 }
9298 
9299 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9300   switch(type) {
9301     case T_INT:
9302       evprord(dst, mask, src, shift, merge, vlen_enc); break;
9303     case T_LONG:
9304       evprorq(dst, mask, src, shift, merge, vlen_enc); break;
9305     default:
9306       fatal("Unexpected type argument %s", type2name(type)); break;
9307   }
9308 }
9309 
9310 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9311   switch(type) {
9312     case T_INT:
9313       evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
9314     case T_LONG:
9315       evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
9316     default:
9317       fatal("Unexpected type argument %s", type2name(type)); break;
9318   }
9319 }
9320 
9321 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9322   switch(type) {
9323     case T_INT:
9324       evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
9325     case T_LONG:
9326       evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
9327     default:
9328       fatal("Unexpected type argument %s", type2name(type)); break;
9329   }
9330 }
9331 
9332 void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9333   assert(rscratch != noreg || always_reachable(src), "missing");
9334 
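  // If the literal is RIP-reachable, encode it directly; otherwise materialize
  // its address in rscratch and use a register-indirect operand. The same
  // pattern is used by the AddressLiteral wrappers below.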
9335   if (reachable(src)) {
9336     evpandq(dst, nds, as_Address(src), vector_len);
9337   } else {
9338     lea(rscratch, src);
9339     evpandq(dst, nds, Address(rscratch, 0), vector_len);
9340   }
9341 }
9342 
9343 void MacroAssembler::evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
9344   assert(rscratch != noreg || always_reachable(src), "missing");
9345 
9346   if (reachable(src)) {
9347     Assembler::evpaddq(dst, mask, nds, as_Address(src), merge, vector_len);
9348   } else {
9349     lea(rscratch, src);
9350     Assembler::evpaddq(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
9351   }
9352 }
9353 
9354 void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9355   assert(rscratch != noreg || always_reachable(src), "missing");
9356 
9357   if (reachable(src)) {
9358     evporq(dst, nds, as_Address(src), vector_len);
9359   } else {
9360     lea(rscratch, src);
9361     evporq(dst, nds, Address(rscratch, 0), vector_len);
9362   }
9363 }
9364 
9365 void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) {
9366   assert(rscratch != noreg || always_reachable(src3), "missing");
9367 
9368   if (reachable(src3)) {
9369     vpternlogq(dst, imm8, src2, as_Address(src3), vector_len);
9370   } else {
9371     lea(rscratch, src3);
9372     vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len);
9373   }
9374 }
9375 
9376 #if COMPILER2_OR_JVMCI
9377 
9378 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
9379                                  Register length, Register temp, int vec_enc) {
9380   // Computing mask for predicated vector store.
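  // (bzhiq leaves the low 'length' bits of temp set; kmov turns that into a
  // k-mask, so the evmovdqu below stores exactly 'length' elements.)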
9381   movptr(temp, -1);
9382   bzhiq(temp, temp, length);
9383   kmov(mask, temp);
9384   evmovdqu(bt, mask, dst, xmm, true, vec_enc);
9385 }
9386 
9387 // Set memory operation for lengths less than 64 bytes.
9388 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
9389                                        XMMRegister xmm, KRegister mask, Register length,
9390                                        Register temp, bool use64byteVector) {
9391   assert(MaxVectorSize >= 32, "vector length should be >= 32");
9392   const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9393   if (!use64byteVector) {
9394     fill32(dst, disp, xmm);
9395     subptr(length, 32 >> shift);
9396     fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
9397   } else {
9398     assert(MaxVectorSize == 64, "vector length != 64");
9399     fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
9400   }
9401 }
9402 
9403 
9404 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
9405                                        XMMRegister xmm, KRegister mask, Register length,
9406                                        Register temp) {
9407   assert(MaxVectorSize >= 32, "vector length should be >= 32");
9408   const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9409   fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
9410 }
9411 
9412 
9413 void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
9414   assert(MaxVectorSize >= 32, "vector length should be >= 32");
9415   vmovdqu(dst, xmm);
9416 }
9417 
9418 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
9419   fill32(Address(dst, disp), xmm);
9420 }
9421 
9422 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
9423   assert(MaxVectorSize >= 32, "vector length should be >= 32");
9424   if (!use64byteVector) {
9425     fill32(dst, xmm);
9426     fill32(dst.plus_disp(32), xmm);
9427   } else {
9428     evmovdquq(dst, xmm, Assembler::AVX_512bit);
9429   }
9430 }
9431 
9432 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
9433   fill64(Address(dst, disp), xmm, use64byteVector);
9434 }
9435 
9436 #ifdef _LP64
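// Fill 'count' elements of 'type' at 'to' with 'value' using AVX-512 masked
// stores. Short lengths are handled with one or two masked 32/64-byte stores;
// longer lengths align the destination and then loop over 128-byte (YMM) or
// 192-byte (ZMM) blocks, chosen based on MaxVectorSize and
// VM_Version::avx3_threshold().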
9437 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
9438                                         Register count, Register rtmp, XMMRegister xtmp) {
9439   Label L_exit;
9440   Label L_fill_start;
9441   Label L_fill_64_bytes;
9442   Label L_fill_96_bytes;
9443   Label L_fill_128_bytes;
9444   Label L_fill_128_bytes_loop;
9445   Label L_fill_128_loop_header;
9446   Label L_fill_128_bytes_loop_header;
9447   Label L_fill_128_bytes_loop_pre_header;
9448   Label L_fill_zmm_sequence;
9449 
9450   int shift = -1;
9451   int avx3threshold = VM_Version::avx3_threshold();
9452   switch(type) {
9453     case T_BYTE:  shift = 0;
9454       break;
9455     case T_SHORT: shift = 1;
9456       break;
9457     case T_INT:   shift = 2;
9458       break;
9459     /* Uncomment when LONG fill stubs are supported.
9460     case T_LONG:  shift = 3;
9461       break;
9462     */
9463     default:
9464       fatal("Unhandled type: %s\n", type2name(type));
9465   }
9466 
9467   if ((avx3threshold != 0)  || (MaxVectorSize == 32)) {
9468 
9469     if (MaxVectorSize == 64) {
9470       cmpq(count, avx3threshold >> shift);
9471       jcc(Assembler::greater, L_fill_zmm_sequence);
9472     }
9473 
9474     evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
9475 
9476     bind(L_fill_start);
9477 
9478     cmpq(count, 32 >> shift);
9479     jccb(Assembler::greater, L_fill_64_bytes);
9480     fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
9481     jmp(L_exit);
9482 
9483     bind(L_fill_64_bytes);
9484     cmpq(count, 64 >> shift);
9485     jccb(Assembler::greater, L_fill_96_bytes);
9486     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
9487     jmp(L_exit);
9488 
9489     bind(L_fill_96_bytes);
9490     cmpq(count, 96 >> shift);
9491     jccb(Assembler::greater, L_fill_128_bytes);
9492     fill64(to, 0, xtmp);
9493     subq(count, 64 >> shift);
9494     fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
9495     jmp(L_exit);
9496 
9497     bind(L_fill_128_bytes);
9498     cmpq(count, 128 >> shift);
9499     jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
9500     fill64(to, 0, xtmp);
9501     fill32(to, 64, xtmp);
9502     subq(count, 96 >> shift);
9503     fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
9504     jmp(L_exit);
9505 
9506     bind(L_fill_128_bytes_loop_pre_header);
9507     {
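      // Align 'to' to a 32-byte boundary: store the leading (32 - misalignment)
      // bytes with a masked byte store, then advance 'to' and reduce 'count' by
      // the corresponding number of elements.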
9508       mov(rtmp, to);
9509       andq(rtmp, 31);
9510       jccb(Assembler::zero, L_fill_128_bytes_loop_header);
9511       negq(rtmp);
9512       addq(rtmp, 32);
9513       mov64(r8, -1L);
9514       bzhiq(r8, r8, rtmp);
9515       kmovql(k2, r8);
9516       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit);
9517       addq(to, rtmp);
9518       shrq(rtmp, shift);
9519       subq(count, rtmp);
9520     }
9521 
9522     cmpq(count, 128 >> shift);
9523     jcc(Assembler::less, L_fill_start);
9524 
9525     bind(L_fill_128_bytes_loop_header);
9526     subq(count, 128 >> shift);
9527 
9528     align32();
9529     bind(L_fill_128_bytes_loop);
9530       fill64(to, 0, xtmp);
9531       fill64(to, 64, xtmp);
9532       addq(to, 128);
9533       subq(count, 128 >> shift);
9534       jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
9535 
9536     addq(count, 128 >> shift);
9537     jcc(Assembler::zero, L_exit);
9538     jmp(L_fill_start);
9539   }
9540 
9541   if (MaxVectorSize == 64) {
9542     // Sequence using 64 byte ZMM register.
9543     Label L_fill_128_bytes_zmm;
9544     Label L_fill_192_bytes_zmm;
9545     Label L_fill_192_bytes_loop_zmm;
9546     Label L_fill_192_bytes_loop_header_zmm;
9547     Label L_fill_192_bytes_loop_pre_header_zmm;
9548     Label L_fill_start_zmm_sequence;
9549 
9550     bind(L_fill_zmm_sequence);
9551     evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
9552 
9553     bind(L_fill_start_zmm_sequence);
9554     cmpq(count, 64 >> shift);
9555     jccb(Assembler::greater, L_fill_128_bytes_zmm);
9556     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
9557     jmp(L_exit);
9558 
9559     bind(L_fill_128_bytes_zmm);
9560     cmpq(count, 128 >> shift);
9561     jccb(Assembler::greater, L_fill_192_bytes_zmm);
9562     fill64(to, 0, xtmp, true);
9563     subq(count, 64 >> shift);
9564     fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
9565     jmp(L_exit);
9566 
9567     bind(L_fill_192_bytes_zmm);
9568     cmpq(count, 192 >> shift);
9569     jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
9570     fill64(to, 0, xtmp, true);
9571     fill64(to, 64, xtmp, true);
9572     subq(count, 128 >> shift);
9573     fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
9574     jmp(L_exit);
9575 
9576     bind(L_fill_192_bytes_loop_pre_header_zmm);
9577     {
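      // Same alignment step as above, but to a 64-byte boundary for the ZMM loop.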
9578       movq(rtmp, to);
9579       andq(rtmp, 63);
9580       jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
9581       negq(rtmp);
9582       addq(rtmp, 64);
9583       mov64(r8, -1L);
9584       bzhiq(r8, r8, rtmp);
9585       kmovql(k2, r8);
9586       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit);
9587       addq(to, rtmp);
9588       shrq(rtmp, shift);
9589       subq(count, rtmp);
9590     }
9591 
9592     cmpq(count, 192 >> shift);
9593     jcc(Assembler::less, L_fill_start_zmm_sequence);
9594 
9595     bind(L_fill_192_bytes_loop_header_zmm);
9596     subq(count, 192 >> shift);
9597 
9598     align32();
9599     bind(L_fill_192_bytes_loop_zmm);
9600       fill64(to, 0, xtmp, true);
9601       fill64(to, 64, xtmp, true);
9602       fill64(to, 128, xtmp, true);
9603       addq(to, 192);
9604       subq(count, 192 >> shift);
9605       jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
9606 
9607     addq(count, 192 >> shift);
9608     jcc(Assembler::zero, L_exit);
9609     jmp(L_fill_start_zmm_sequence);
9610   }
9611   bind(L_exit);
9612 }
9613 #endif
9614 #endif //COMPILER2_OR_JVMCI
9615 
9616 
9617 #ifdef _LP64
9618 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
9619   Label done;
9620   cvttss2sil(dst, src);
9621   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
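  // cvttss2sil returns the "integer indefinite" value 0x80000000 for NaN and
  // out-of-range inputs, which is why that value selects the slow-path fixup below.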
9622   cmpl(dst, 0x80000000); // float_sign_flip
9623   jccb(Assembler::notEqual, done);
9624   subptr(rsp, 8);
9625   movflt(Address(rsp, 0), src);
9626   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
9627   pop(dst);
9628   bind(done);
9629 }
9630 
9631 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
9632   Label done;
9633   cvttsd2sil(dst, src);
9634   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9635   cmpl(dst, 0x80000000); // float_sign_flip
9636   jccb(Assembler::notEqual, done);
9637   subptr(rsp, 8);
9638   movdbl(Address(rsp, 0), src);
9639   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
9640   pop(dst);
9641   bind(done);
9642 }
9643 
9644 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
9645   Label done;
9646   cvttss2siq(dst, src);
9647   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9648   jccb(Assembler::notEqual, done);
9649   subptr(rsp, 8);
9650   movflt(Address(rsp, 0), src);
9651   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
9652   pop(dst);
9653   bind(done);
9654 }
9655 
9656 void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
9657   // The following code is a line-by-line assembly translation of the rounding
9658   // algorithm. Please refer to the java.lang.Math.round(float) algorithm for details.
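  //
  // Roughly, the algorithm being translated is (an illustrative C-like sketch,
  // not the exact JDK source):
  //
  //   int32_t bits      = float_to_raw_int_bits(f);
  //   int32_t biasedExp = (bits & EXP_BIT_MASK) >> (SIGNIFICAND_WIDTH - 1);
  //   int32_t shift     = (SIGNIFICAND_WIDTH - 2 + EXP_BIAS) - biasedExp;
  //   if ((shift & -32) == 0) {                      // 0 <= shift < 32
  //     int32_t r = (bits & SIGNIF_BIT_MASK) | (SIGNIF_BIT_MASK + 1);
  //     if (bits < 0) r = -r;
  //     return ((r >> shift) + 1) >> 1;              // round half up
  //   } else {
  //     return (int32_t) f;                          // convert_f2i below
  //   }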
9659   const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
9660   const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
9661   const int32_t FloatConsts_EXP_BIAS = 127;
9662   const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
9663   const int32_t MINUS_32 = 0xFFFFFFE0;
9664   Label L_special_case, L_block1, L_exit;
9665   movl(rtmp, FloatConsts_EXP_BIT_MASK);
9666   movdl(dst, src);
9667   andl(dst, rtmp);
9668   sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
9669   movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
9670   subl(rtmp, dst);
9671   movl(rcx, rtmp);
9672   movl(dst, MINUS_32);
9673   testl(rtmp, dst);
9674   jccb(Assembler::notEqual, L_special_case);
9675   movdl(dst, src);
9676   andl(dst, FloatConsts_SIGNIF_BIT_MASK);
9677   orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
9678   movdl(rtmp, src);
9679   testl(rtmp, rtmp);
9680   jccb(Assembler::greaterEqual, L_block1);
9681   negl(dst);
9682   bind(L_block1);
9683   sarl(dst);
9684   addl(dst, 0x1);
9685   sarl(dst, 0x1);
9686   jmp(L_exit);
9687   bind(L_special_case);
9688   convert_f2i(dst, src);
9689   bind(L_exit);
9690 }
9691 
9692 void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
9693   // The following code is a line-by-line assembly translation of the rounding
9694   // algorithm. Please refer to the java.lang.Math.round(double) algorithm for details.
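  // The structure mirrors round_float above, using the binary64 constants
  // (SIGNIFICAND_WIDTH = 53, EXP_BIAS = 1023) and 64-bit arithmetic; the
  // out-of-range case falls back to convert_d2l below.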
9695   const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
9696   const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
9697   const int64_t DoubleConsts_EXP_BIAS = 1023;
9698   const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
9699   const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
9700   Label L_special_case, L_block1, L_exit;
9701   mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
9702   movq(dst, src);
9703   andq(dst, rtmp);
9704   sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
9705   mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
9706   subq(rtmp, dst);
9707   movq(rcx, rtmp);
9708   mov64(dst, MINUS_64);
9709   testq(rtmp, dst);
9710   jccb(Assembler::notEqual, L_special_case);
9711   movq(dst, src);
9712   mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
9713   andq(dst, rtmp);
9714   mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
9715   orq(dst, rtmp);
9716   movq(rtmp, src);
9717   testq(rtmp, rtmp);
9718   jccb(Assembler::greaterEqual, L_block1);
9719   negq(dst);
9720   bind(L_block1);
9721   sarq(dst);
9722   addq(dst, 0x1);
9723   sarq(dst, 0x1);
9724   jmp(L_exit);
9725   bind(L_special_case);
9726   convert_d2l(dst, src);
9727   bind(L_exit);
9728 }
9729 
9730 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
9731   Label done;
9732   cvttsd2siq(dst, src);
9733   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9734   jccb(Assembler::notEqual, done);
9735   subptr(rsp, 8);
9736   movdbl(Address(rsp, 0), src);
9737   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
9738   pop(dst);
9739   bind(done);
9740 }
9741 
9742 void MacroAssembler::cache_wb(Address line)
9743 {
9744   // 64-bit CPUs always support clflush.
9745   assert(VM_Version::supports_clflush(), "clflush should be available");
9746   bool optimized = VM_Version::supports_clflushopt();
9747   bool no_evict = VM_Version::supports_clwb();
9748 
9749   // Prefer clwb (writeback without evict); otherwise prefer clflushopt
9750   // (potentially parallel writeback with evict); otherwise fall back on
9751   // clflush (serial writeback with evict).
9752 
9753   if (optimized) {
9754     if (no_evict) {
9755       clwb(line);
9756     } else {
9757       clflushopt(line);
9758     }
9759   } else {
9760     // no need for fence when using CLFLUSH
9761     clflush(line);
9762   }
9763 }
9764 
9765 void MacroAssembler::cache_wbsync(bool is_pre)
9766 {
9767   assert(VM_Version::supports_clflush(), "clflush should be available");
9768   bool optimized = VM_Version::supports_clflushopt();
9769   bool no_evict = VM_Version::supports_clwb();
9770 
9771   // Decide whether the flush sequence needs a trailing fence.
9772 
9773   if (!is_pre && (optimized || no_evict)) {
9774     // An sfence is needed after a post-flush when using clflushopt or clwb;
9775     // otherwise no synchronization is needed.
9776 
9777     sfence();
9778   }
9779 }
9780 
9781 #endif // _LP64
9782 
9783 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9784   switch (cond) {
9785     // Note some conditions are synonyms for others
9786     case Assembler::zero:         return Assembler::notZero;
9787     case Assembler::notZero:      return Assembler::zero;
9788     case Assembler::less:         return Assembler::greaterEqual;
9789     case Assembler::lessEqual:    return Assembler::greater;
9790     case Assembler::greater:      return Assembler::lessEqual;
9791     case Assembler::greaterEqual: return Assembler::less;
9792     case Assembler::below:        return Assembler::aboveEqual;
9793     case Assembler::belowEqual:   return Assembler::above;
9794     case Assembler::above:        return Assembler::belowEqual;
9795     case Assembler::aboveEqual:   return Assembler::below;
9796     case Assembler::overflow:     return Assembler::noOverflow;
9797     case Assembler::noOverflow:   return Assembler::overflow;
9798     case Assembler::negative:     return Assembler::positive;
9799     case Assembler::positive:     return Assembler::negative;
9800     case Assembler::parity:       return Assembler::noParity;
9801     case Assembler::noParity:     return Assembler::parity;
9802   }
9803   ShouldNotReachHere(); return Assembler::overflow;
9804 }
9805 
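// SkipIfEqual is an RAII helper: the constructor emits a compare of *flag_addr
// against 'value' and a conditional branch that skips the code emitted between
// construction and destruction when they are equal; the destructor binds the
// branch target.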
9806 SkipIfEqual::SkipIfEqual(
9807     MacroAssembler* masm, const bool* flag_addr, bool value, Register rscratch) {
9808   _masm = masm;
9809   _masm->cmp8(ExternalAddress((address)flag_addr), value, rscratch);
9810   _masm->jcc(Assembler::equal, _label);
9811 }
9812 
9813 SkipIfEqual::~SkipIfEqual() {
9814   _masm->bind(_label);
9815 }
9816 
9817 // 32-bit Windows has its own fast-path implementation
9818 // of get_thread
9819 #if !defined(WIN32) || defined(_LP64)
9820 
9821 // This is simply a call to Thread::current()
9822 void MacroAssembler::get_thread(Register thread) {
9823   if (thread != rax) {
9824     push(rax);
9825   }
9826   LP64_ONLY(push(rdi);)
9827   LP64_ONLY(push(rsi);)
9828   push(rdx);
9829   push(rcx);
9830 #ifdef _LP64
9831   push(r8);
9832   push(r9);
9833   push(r10);
9834   push(r11);
9835 #endif
9836 
9837   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
9838 
9839 #ifdef _LP64
9840   pop(r11);
9841   pop(r10);
9842   pop(r9);
9843   pop(r8);
9844 #endif
9845   pop(rcx);
9846   pop(rdx);
9847   LP64_ONLY(pop(rsi);)
9848   LP64_ONLY(pop(rdi);)
9849   if (thread != rax) {
9850     mov(thread, rax);
9851     pop(rax);
9852   }
9853 }
9854 
9855 
9856 #endif // !WIN32 || _LP64
9857 
9858 void MacroAssembler::check_stack_alignment(Register sp, const char* msg, unsigned bias, Register tmp) {
9859   Label L_stack_ok;
9860   if (bias == 0) {
9861     testptr(sp, 2 * wordSize - 1);
9862   } else {
9863     // lea(tmp, Address(rsp, bias));
9864     mov(tmp, sp);
9865     addptr(tmp, bias);
9866     testptr(tmp, 2 * wordSize - 1);
9867   }
9868   jcc(Assembler::equal, L_stack_ok);
9869   block_comment(msg);
9870   stop(msg);
9871   bind(L_stack_ok);
9872 }
9873 
9874 // Implements lightweight-locking.
9875 //
9876 // obj: the object to be locked
9877 // reg_rax: rax
9878 // thread: the thread which attempts to lock obj
9879 // tmp: a temporary register
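//
// The low two bits of the markWord encode the lock state: 0b01 means unlocked,
// 0b00 means fast-locked, and 0b10 means the object has an inflated monitor.
// Locking therefore attempts to CAS the mark from 0b01 to 0b00 and, on
// success, pushes obj onto the per-thread lock-stack.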
9880 void MacroAssembler::lightweight_lock(Register obj, Register reg_rax, Register thread, Register tmp, Label& slow) {
9881   assert(reg_rax == rax, "");
9882   assert_different_registers(obj, reg_rax, thread, tmp);
9883 
9884   Label push;
9885   const Register top = tmp;
9886 
9887   // Preload the markWord. It is important that this is the first
9888   // instruction emitted as it is part of C1's null check semantics.
9889   movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
9890 
9891   // Load top.
9892   movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
9893 
9894   // Check if the lock-stack is full.
9895   cmpl(top, LockStack::end_offset());
9896   jcc(Assembler::greaterEqual, slow);
9897 
9898   // Check for recursion.
9899   cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
9900   jcc(Assembler::equal, push);
9901 
9902   // Check header for monitor (0b10).
9903   testptr(reg_rax, markWord::monitor_value);
9904   jcc(Assembler::notZero, slow);
9905 
9906   // Try to lock. Transition lock bits 0b01 => 0b00
9907   movptr(tmp, reg_rax);
9908   andptr(tmp, ~(int32_t)markWord::unlocked_value);
9909   orptr(reg_rax, markWord::unlocked_value);
9910   lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
9911   jcc(Assembler::notEqual, slow);
9912 
9913   // Restore top, CAS clobbers register.
9914   movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
9915 
9916   bind(push);
9917   // After successful lock, push object on lock-stack.
9918   movptr(Address(thread, top), obj);
9919   incrementl(top, oopSize);
9920   movl(Address(thread, JavaThread::lock_stack_top_offset()), top);
9921 }
9922 
9923 // Implements lightweight-unlocking.
9924 //
9925 // obj: the object to be unlocked
9926 // reg_rax: rax
9927 // thread: the thread
9928 // tmp: a temporary register
9929 //
9930 // x86_32 Note: reg_rax and thread may alias each other due to limited register
9931 //              availability.
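//
// Unlocking reverses lightweight_lock: obj is popped off the lock-stack and,
// unless the lock was recursive, the mark bits are CASed back from 0b00 to
// 0b01; on failure the lock-stack is restored and the slow path is taken.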
9932 void MacroAssembler::lightweight_unlock(Register obj, Register reg_rax, Register thread, Register tmp, Label& slow) {
9933   assert(reg_rax == rax, "");
9934   assert_different_registers(obj, reg_rax, tmp);
9935   LP64_ONLY(assert_different_registers(obj, reg_rax, thread, tmp);)
9936 
9937   Label unlocked, push_and_slow;
9938   const Register top = tmp;
9939 
9940   // Check if obj is top of lock-stack.
9941   movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
9942   cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
9943   jcc(Assembler::notEqual, slow);
9944 
9945   // Pop lock-stack.
9946   DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
9947   subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
9948 
9949   // Check if recursive.
9950   cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
9951   jcc(Assembler::equal, unlocked);
9952 
9953   // Not recursive. Check header for monitor (0b10).
9954   movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
9955   testptr(reg_rax, markWord::monitor_value);
9956   jcc(Assembler::notZero, push_and_slow);
9957 
9958 #ifdef ASSERT
9959   // Check header not unlocked (0b01).
9960   Label not_unlocked;
9961   testptr(reg_rax, markWord::unlocked_value);
9962   jcc(Assembler::zero, not_unlocked);
9963   stop("lightweight_unlock already unlocked");
9964   bind(not_unlocked);
9965 #endif
9966 
9967   // Try to unlock. Transition lock bits 0b00 => 0b01
9968   movptr(tmp, reg_rax);
9969   orptr(tmp, markWord::unlocked_value);
9970   lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
9971   jcc(Assembler::equal, unlocked);
9972 
9973   bind(push_and_slow);
9974   // Restore lock-stack and handle the unlock in runtime.
9975   if (thread == reg_rax) {
9976     // On x86_32 we may lose the thread.
9977     get_thread(thread);
9978   }
9979 #ifdef ASSERT
9980   movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
9981   movptr(Address(thread, top), obj);
9982 #endif
9983   addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
9984   jmp(slow);
9985 
9986   bind(unlocked);
9987 }