1 /*
   2  * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/compiler_globals.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/collectedHeap.inline.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/bytecodeHistogram.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "logging/log.hpp"
  38 #include "memory/resourceArea.hpp"
  39 #include "memory/universe.hpp"
  40 #include "oops/accessDecorators.hpp"
  41 #include "oops/compressedKlass.inline.hpp"
  42 #include "oops/compressedOops.inline.hpp"
  43 #include "oops/klass.inline.hpp"
  44 #include "prims/methodHandles.hpp"
  45 #include "runtime/continuation.hpp"
  46 #include "runtime/flags/flagSetting.hpp"
  47 #include "runtime/interfaceSupport.inline.hpp"
  48 #include "runtime/javaThread.hpp"
  49 #include "runtime/jniHandles.hpp"
  50 #include "runtime/objectMonitor.hpp"
  51 #include "runtime/os.hpp"
  52 #include "runtime/safepoint.hpp"
  53 #include "runtime/safepointMechanism.hpp"
  54 #include "runtime/sharedRuntime.hpp"
  55 #include "runtime/stubRoutines.hpp"
  56 #include "utilities/macros.hpp"
  57 #include "crc32c.h"
  58 
  59 #ifdef PRODUCT
  60 #define BLOCK_COMMENT(str) /* nothing */
  61 #define STOP(error) stop(error)
  62 #else
  63 #define BLOCK_COMMENT(str) block_comment(str)
  64 #define STOP(error) block_comment(error); stop(error)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 #ifdef ASSERT
  70 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  71 #endif
  72 
  73 static Assembler::Condition reverse[] = {
  74     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  75     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  76     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  77     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  78     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  79     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  80     Assembler::above          /* belowEqual    = 0x6 */ ,
  81     Assembler::belowEqual     /* above         = 0x7 */ ,
  82     Assembler::positive       /* negative      = 0x8 */ ,
  83     Assembler::negative       /* positive      = 0x9 */ ,
  84     Assembler::noParity       /* parity        = 0xa */ ,
  85     Assembler::parity         /* noParity      = 0xb */ ,
  86     Assembler::greaterEqual   /* less          = 0xc */ ,
  87     Assembler::less           /* greaterEqual  = 0xd */ ,
  88     Assembler::greater        /* lessEqual     = 0xe */ ,
  89     Assembler::lessEqual      /* greater       = 0xf, */
  90 
  91 };
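
// Illustrative example of what this table encodes (using the enum values noted in the
// comments above): indexing with a condition yields its logical negation, e.g.
//
//   reverse[Assembler::less]  == Assembler::greaterEqual
//   reverse[Assembler::zero]  == Assembler::notZero
//   reverse[Assembler::below] == Assembler::aboveEqual
//
// which lets callers flip the sense of a jcc without re-deriving the encoding.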
  92 
  93 
  94 // Implementation of MacroAssembler
  95 
  96 // First, all the versions that differ between 32-bit and 64-bit,
  97 // unless the difference is trivial (a line or so).
  98 
  99 #ifndef _LP64
 100 
 101 // 32bit versions
 102 
 103 Address MacroAssembler::as_Address(AddressLiteral adr) {
 104   return Address(adr.target(), adr.rspec());
 105 }
 106 
 107 Address MacroAssembler::as_Address(ArrayAddress adr) {
 108   return Address::make_array(adr);
 109 }
 110 
 111 void MacroAssembler::call_VM_leaf_base(address entry_point,
 112                                        int number_of_arguments) {
 113   call(RuntimeAddress(entry_point));
 114   increment(rsp, number_of_arguments * wordSize);
 115 }
 116 
 117 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 118   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 119 }
 120 
 121 
 122 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 123   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 124 }
 125 
 126 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 127   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 128 }
 129 
 130 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 131   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 132 }
 133 
 134 void MacroAssembler::extend_sign(Register hi, Register lo) {
 135   // According to Intel Doc. AP-526, "Integer Divide", p.18.
 136   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 137     cdql();
 138   } else {
 139     movl(hi, lo);
 140     sarl(hi, 31);
 141   }
 142 }
 143 
 144 void MacroAssembler::jC2(Register tmp, Label& L) {
 145   // set parity bit if FPU flag C2 is set (via rax)
 146   save_rax(tmp);
 147   fwait(); fnstsw_ax();
 148   sahf();
 149   restore_rax(tmp);
 150   // branch
 151   jcc(Assembler::parity, L);
 152 }
 153 
 154 void MacroAssembler::jnC2(Register tmp, Label& L) {
 155   // set parity bit if FPU flag C2 is set (via rax)
 156   save_rax(tmp);
 157   fwait(); fnstsw_ax();
 158   sahf();
 159   restore_rax(tmp);
 160   // branch
 161   jcc(Assembler::noParity, L);
 162 }
 163 
 164 // 32bit can do a case table jump in one instruction but we no longer allow the base
 165 // to be installed in the Address class
 166 void MacroAssembler::jump(ArrayAddress entry) {
 167   jmp(as_Address(entry));
 168 }
 169 
 170 // Note: y_lo will be destroyed
 171 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 172   // Long compare for Java (semantics as described in JVM spec.)
 173   Label high, low, done;
 174 
 175   cmpl(x_hi, y_hi);
 176   jcc(Assembler::less, low);
 177   jcc(Assembler::greater, high);
 178   // x_hi is the return register
 179   xorl(x_hi, x_hi);
 180   cmpl(x_lo, y_lo);
 181   jcc(Assembler::below, low);
 182   jcc(Assembler::equal, done);
 183 
 184   bind(high);
 185   xorl(x_hi, x_hi);
 186   increment(x_hi);
 187   jmp(done);
 188 
 189   bind(low);
 190   xorl(x_hi, x_hi);
 191   decrementl(x_hi);
 192 
 193   bind(done);
 194 }
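
// A rough C sketch of the comparison implemented above (illustrative only; the two
// halves are treated as one signed 64-bit value, and the result lands in x_hi):
//
//   int lcmp2int(int32_t x_hi, uint32_t x_lo, int32_t y_hi, uint32_t y_lo) {
//     int64_t x = ((int64_t)x_hi << 32) | x_lo;
//     int64_t y = ((int64_t)y_hi << 32) | y_lo;
//     return x < y ? -1 : (x > y ? 1 : 0);
//   }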
 195 
 196 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 197     mov_literal32(dst, (int32_t)src.target(), src.rspec());
 198 }
 199 
 200 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 201   // leal(dst, as_Address(adr));
 202   // see note in movl as to why we must use a move
 203   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 204 }
 205 
 206 void MacroAssembler::leave() {
 207   mov(rsp, rbp);
 208   pop(rbp);
 209 }
 210 
 211 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 212   // Multiplication of two Java long values stored on the stack
 213   // as illustrated below. Result is in rdx:rax.
 214   //
 215   // rsp ---> [  ??  ] \               \
 216   //            ....    | y_rsp_offset  |
 217   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 218   //          [ y_hi ]                  | (in bytes)
 219   //            ....                    |
 220   //          [ x_lo ]                 /
 221   //          [ x_hi ]
 222   //            ....
 223   //
 224   // Basic idea: lo(result) = lo(x_lo * y_lo)
 225   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
 226   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 227   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 228   Label quick;
 229   // load x_hi, y_hi and check if quick
 230   // multiplication is possible
 231   movl(rbx, x_hi);
 232   movl(rcx, y_hi);
 233   movl(rax, rbx);
 234   orl(rbx, rcx);                                 // rbx = 0 <=> x_hi = 0 and y_hi = 0
 235   jcc(Assembler::zero, quick);                   // if rbx = 0 do quick multiply
 236   // do full multiplication
 237   // 1st step
 238   mull(y_lo);                                    // x_hi * y_lo
 239   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx
 240   // 2nd step
 241   movl(rax, x_lo);
 242   mull(rcx);                                     // x_lo * y_hi
 243   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx
 244   // 3rd step
 245   bind(quick);                                   // note: rbx = 0 if quick multiply!
 246   movl(rax, x_lo);
 247   mull(y_lo);                                    // x_lo * y_lo
 248   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 249 }
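
// Rough C equivalent of the multiplication above (illustrative only). The high halves
// of the cross products, and x_hi * y_hi entirely, fall outside the low 64 bits:
//
//   uint64_t lmul(uint32_t x_hi, uint32_t x_lo, uint32_t y_hi, uint32_t y_lo) {
//     uint64_t lo = (uint64_t)x_lo * y_lo;              // mull(y_lo) with rax = x_lo
//     uint32_t hi = (uint32_t)(lo >> 32)
//                 + x_hi * y_lo + x_lo * y_hi;          // low halves of the cross terms
//     return ((uint64_t)hi << 32) | (uint32_t)lo;       // rdx:rax
//   }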
 250 
 251 void MacroAssembler::lneg(Register hi, Register lo) {
 252   negl(lo);
 253   adcl(hi, 0);
 254   negl(hi);
 255 }
 256 
 257 void MacroAssembler::lshl(Register hi, Register lo) {
 258   // Java shift left long support (semantics as described in JVM spec., p.305)
 259   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 260   // shift value is in rcx !
 261   assert(hi != rcx, "must not use rcx");
 262   assert(lo != rcx, "must not use rcx");
 263   const Register s = rcx;                        // shift count
 264   const int      n = BitsPerWord;
 265   Label L;
 266   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 267   cmpl(s, n);                                    // if (s < n)
 268   jcc(Assembler::less, L);                       // else (s >= n)
 269   movl(hi, lo);                                  // x := x << n
 270   xorl(lo, lo);
 271   // Note: subl(s, n) is not needed since the Intel shift instructions use rcx mod n!
 272   bind(L);                                       // s (mod n) < n
 273   shldl(hi, lo);                                 // x := x << s
 274   shll(lo);
 275 }
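
// Illustrative C sketch of the shift above (count already masked to 0..63; not VM code):
//
//   void lshl(uint32_t& hi, uint32_t& lo, unsigned s) {
//     if (s >= 32) { hi = lo; lo = 0; }        // x << s == (x << 32) << (s - 32)
//     s &= 31;                                 // the hardware shifts use the count mod 32
//     if (s != 0) {
//       hi = (hi << s) | (lo >> (32 - s));     // shldl(hi, lo)
//       lo <<= s;                              // shll(lo)
//     }
//   }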
 276 
 277 
 278 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 279   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 280   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
 281   assert(hi != rcx, "must not use rcx");
 282   assert(lo != rcx, "must not use rcx");
 283   const Register s = rcx;                        // shift count
 284   const int      n = BitsPerWord;
 285   Label L;
 286   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 287   cmpl(s, n);                                    // if (s < n)
 288   jcc(Assembler::less, L);                       // else (s >= n)
 289   movl(lo, hi);                                  // x := x >> n
 290   if (sign_extension) sarl(hi, 31);
 291   else                xorl(hi, hi);
 292   // Note: subl(s, n) is not needed since the Intel shift instructions use rcx mod n!
 293   bind(L);                                       // s (mod n) < n
 294   shrdl(lo, hi);                                 // x := x >> s
 295   if (sign_extension) sarl(hi);
 296   else                shrl(hi);
 297 }
 298 
 299 void MacroAssembler::movoop(Register dst, jobject obj) {
 300   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 301 }
 302 
 303 void MacroAssembler::movoop(Address dst, jobject obj) {
 304   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 305 }
 306 
 307 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 308   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 309 }
 310 
 311 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 312   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 313 }
 314 
 315 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 316   // scratch register is not used,
 317   // it is defined to match parameters of 64-bit version of this method.
 318   if (src.is_lval()) {
 319     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 320   } else {
 321     movl(dst, as_Address(src));
 322   }
 323 }
 324 
 325 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 326   movl(as_Address(dst), src);
 327 }
 328 
 329 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 330   movl(dst, as_Address(src));
 331 }
 332 
 333 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 334 void MacroAssembler::movptr(Address dst, intptr_t src) {
 335   movl(dst, src);
 336 }
 337 
 338 void MacroAssembler::pushoop(jobject obj) {
 339   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 340 }
 341 
 342 void MacroAssembler::pushklass(Metadata* obj) {
 343   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 344 }
 345 
 346 void MacroAssembler::pushptr(AddressLiteral src) {
 347   if (src.is_lval()) {
 348     push_literal32((int32_t)src.target(), src.rspec());
 349   } else {
 350     pushl(as_Address(src));
 351   }
 352 }
 353 
 354 static void pass_arg0(MacroAssembler* masm, Register arg) {
 355   masm->push(arg);
 356 }
 357 
 358 static void pass_arg1(MacroAssembler* masm, Register arg) {
 359   masm->push(arg);
 360 }
 361 
 362 static void pass_arg2(MacroAssembler* masm, Register arg) {
 363   masm->push(arg);
 364 }
 365 
 366 static void pass_arg3(MacroAssembler* masm, Register arg) {
 367   masm->push(arg);
 368 }
 369 
 370 #ifndef PRODUCT
 371 extern "C" void findpc(intptr_t x);
 372 #endif
 373 
 374 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 375   // In order to get locks to work, we need to fake an in_VM state
 376   JavaThread* thread = JavaThread::current();
 377   JavaThreadState saved_state = thread->thread_state();
 378   thread->set_thread_state(_thread_in_vm);
 379   if (ShowMessageBoxOnError) {
 383     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 384       ttyLocker ttyl;
 385       BytecodeCounter::print();
 386     }
 387     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 388     // This is the value of eip which points to where verify_oop will return.
 389     if (os::message_box(msg, "Execution stopped, print registers?")) {
 390       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 391       BREAKPOINT;
 392     }
 393   }
 394   fatal("DEBUG MESSAGE: %s", msg);
 395 }
 396 
 397 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 398   ttyLocker ttyl;
 399   FlagSetting fs(Debugging, true);
 400   tty->print_cr("eip = 0x%08x", eip);
 401 #ifndef PRODUCT
 402   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 403     tty->cr();
 404     findpc(eip);
 405     tty->cr();
 406   }
 407 #endif
 408 #define PRINT_REG(rax) \
 409   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 410   PRINT_REG(rax);
 411   PRINT_REG(rbx);
 412   PRINT_REG(rcx);
 413   PRINT_REG(rdx);
 414   PRINT_REG(rdi);
 415   PRINT_REG(rsi);
 416   PRINT_REG(rbp);
 417   PRINT_REG(rsp);
 418 #undef PRINT_REG
 419   // Print some words near top of stack.
 420   int* dump_sp = (int*) rsp;
 421   for (int col1 = 0; col1 < 8; col1++) {
 422     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 423     os::print_location(tty, *dump_sp++);
 424   }
 425   for (int row = 0; row < 16; row++) {
 426     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 427     for (int col = 0; col < 8; col++) {
 428       tty->print(" 0x%08x", *dump_sp++);
 429     }
 430     tty->cr();
 431   }
 432   // Print some instructions around pc:
 433   Disassembler::decode((address)eip-64, (address)eip);
 434   tty->print_cr("--------");
 435   Disassembler::decode((address)eip, (address)eip+32);
 436 }
 437 
 438 void MacroAssembler::stop(const char* msg) {
 439   ExternalAddress message((address)msg);
 440   // push address of message
 441   pushptr(message.addr());
 442   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 443   pusha();                                            // push registers
 444   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 445   hlt();
 446 }
 447 
 448 void MacroAssembler::warn(const char* msg) {
 449   push_CPU_state();
 450 
 451   ExternalAddress message((address) msg);
 452   // push address of message
 453   pushptr(message.addr());
 454 
 455   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 456   addl(rsp, wordSize);       // discard argument
 457   pop_CPU_state();
 458 }
 459 
 460 void MacroAssembler::print_state() {
 461   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 462   pusha();                                            // push registers
 463 
 464   push_CPU_state();
 465   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 466   pop_CPU_state();
 467 
 468   popa();
 469   addl(rsp, wordSize);
 470 }
 471 
 472 #else // _LP64
 473 
 474 // 64 bit versions
 475 
 476 Address MacroAssembler::as_Address(AddressLiteral adr) {
 477   // amd64 always does this as a pc-rel
 478   // we can be absolute or disp based on the instruction type
 479   // jmp/call are displacements others are absolute
 480   assert(!adr.is_lval(), "must be rval");
 481   assert(reachable(adr), "must be");
 482   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 483 
 484 }
 485 
 486 Address MacroAssembler::as_Address(ArrayAddress adr) {
 487   AddressLiteral base = adr.base();
 488   lea(rscratch1, base);
 489   Address index = adr.index();
 490   assert(index._disp == 0, "must not have disp"); // maybe it can?
 491   Address array(rscratch1, index._index, index._scale, index._disp);
 492   return array;
 493 }
 494 
 495 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 496   Label L, E;
 497 
 498 #ifdef _WIN64
 499   // Windows always allocates space for its register args
 500   assert(num_args <= 4, "only register arguments supported");
 501   subq(rsp,  frame::arg_reg_save_area_bytes);
 502 #endif
 503 
 504   // Align stack if necessary
 505   testl(rsp, 15);
 506   jcc(Assembler::zero, L);
 507 
 508   subq(rsp, 8);
 509   call(RuntimeAddress(entry_point));
 510   addq(rsp, 8);
 511   jmp(E);
 512 
 513   bind(L);
 514   call(RuntimeAddress(entry_point));
 515 
 516   bind(E);
 517 
 518 #ifdef _WIN64
 519   // restore stack pointer
 520   addq(rsp, frame::arg_reg_save_area_bytes);
 521 #endif
 522 
 523 }
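
// Ignoring the Win64 home-space bookkeeping, the alignment dance above amounts to the
// following (illustrative pseudo-C): rsp is always 8-byte aligned here, so rsp % 16 is
// either 0 or 8 and a single 8-byte adjustment restores the 16-byte alignment the
// C ABI expects at a call site.
//
//   if ((rsp & 15) != 0) {        // testl(rsp, 15)
//     rsp -= 8;
//     call(entry_point);
//     rsp += 8;
//   } else {
//     call(entry_point);
//   }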
 524 
 525 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 526   assert(!src2.is_lval(), "should use cmpptr");
 527 
 528   if (reachable(src2)) {
 529     cmpq(src1, as_Address(src2));
 530   } else {
 531     lea(rscratch1, src2);
 532     Assembler::cmpq(src1, Address(rscratch1, 0));
 533   }
 534 }
 535 
 536 int MacroAssembler::corrected_idivq(Register reg) {
 537   // Full implementation of Java ldiv and lrem; checks for special
 538   // case as described in JVM spec., p.243 & p.271.  The function
 539   // returns the (pc) offset of the idivl instruction - may be needed
 540   // for implicit exceptions.
 541   //
 542   //         normal case                           special case
 543   //
 544   // input : rax: dividend                         min_long
 545   //         reg: divisor   (may not be eax/edx)   -1
 546   //
 547   // output: rax: quotient  (= rax idiv reg)       min_long
 548   //         rdx: remainder (= rax irem reg)       0
 549   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 550   static const int64_t min_long = 0x8000000000000000;
 551   Label normal_case, special_case;
 552 
 553   // check for special case
 554   cmp64(rax, ExternalAddress((address) &min_long));
 555   jcc(Assembler::notEqual, normal_case);
 556   xorl(rdx, rdx); // prepare rdx for possible special case (where
 557                   // remainder = 0)
 558   cmpq(reg, -1);
 559   jcc(Assembler::equal, special_case);
 560 
 561   // handle normal case
 562   bind(normal_case);
 563   cdqq();
 564   int idivq_offset = offset();
 565   idivq(reg);
 566 
 567   // normal and special case exit
 568   bind(special_case);
 569 
 570   return idivq_offset;
 571 }
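
// The special case guards the one overflowing division: min_long / -1 has no
// representable quotient and a raw idivq would fault. Illustrative C++ sketch
// (corrected_ldiv is a hypothetical name, not a VM function):
//
//   int64_t corrected_ldiv(int64_t dividend, int64_t divisor) {
//     if (dividend == INT64_MIN && divisor == -1) return INT64_MIN;  // remainder is 0
//     return dividend / divisor;                                     // cdqq; idivq
//   }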
 572 
 573 void MacroAssembler::decrementq(Register reg, int value) {
 574   if (value == min_jint) { subq(reg, value); return; }
 575   if (value <  0) { incrementq(reg, -value); return; }
 576   if (value == 0) {                        ; return; }
 577   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 578   /* else */      { subq(reg, value)       ; return; }
 579 }
 580 
 581 void MacroAssembler::decrementq(Address dst, int value) {
 582   if (value == min_jint) { subq(dst, value); return; }
 583   if (value <  0) { incrementq(dst, -value); return; }
 584   if (value == 0) {                        ; return; }
 585   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 586   /* else */      { subq(dst, value)       ; return; }
 587 }
 588 
 589 void MacroAssembler::incrementq(AddressLiteral dst) {
 590   if (reachable(dst)) {
 591     incrementq(as_Address(dst));
 592   } else {
 593     lea(rscratch1, dst);
 594     incrementq(Address(rscratch1, 0));
 595   }
 596 }
 597 
 598 void MacroAssembler::incrementq(Register reg, int value) {
 599   if (value == min_jint) { addq(reg, value); return; }
 600   if (value <  0) { decrementq(reg, -value); return; }
 601   if (value == 0) {                        ; return; }
 602   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 603   /* else */      { addq(reg, value)       ; return; }
 604 }
 605 
 606 void MacroAssembler::incrementq(Address dst, int value) {
 607   if (value == min_jint) { addq(dst, value); return; }
 608   if (value <  0) { decrementq(dst, -value); return; }
 609   if (value == 0) {                        ; return; }
 610   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 611   /* else */      { addq(dst, value)       ; return; }
 612 }
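
// Why the min_jint test comes first in these helpers (illustrative): negating min_jint
// overflows a 32-bit int, so that value must never reach the "value < 0" forwarding step.
//
//   incrementq(reg, min_jint);   // emitted directly as addq(reg, min_jint)
//   incrementq(reg, -5);         // forwarded to decrementq(reg, 5), i.e. subq(reg, 5)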
 613 
 614 // 32bit can do a case table jump in one instruction but we no longer allow the base
 615 // to be installed in the Address class
 616 void MacroAssembler::jump(ArrayAddress entry) {
 617   lea(rscratch1, entry.base());
 618   Address dispatch = entry.index();
 619   assert(dispatch._base == noreg, "must be");
 620   dispatch._base = rscratch1;
 621   jmp(dispatch);
 622 }
 623 
 624 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 625   ShouldNotReachHere(); // 64bit doesn't use two regs
 626   cmpq(x_lo, y_lo);
 627 }
 628 
 629 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 630     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 631 }
 632 
 633 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 634   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 635   movptr(dst, rscratch1);
 636 }
 637 
 638 void MacroAssembler::leave() {
 639   // %%% is this really better? Why not on 32bit too?
 640   emit_int8((unsigned char)0xC9); // LEAVE
 641 }
 642 
 643 void MacroAssembler::lneg(Register hi, Register lo) {
 644   ShouldNotReachHere(); // 64bit doesn't use two regs
 645   negq(lo);
 646 }
 647 
 648 void MacroAssembler::movoop(Register dst, jobject obj) {
 649   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 650 }
 651 
 652 void MacroAssembler::movoop(Address dst, jobject obj) {
 653   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 654   movq(dst, rscratch1);
 655 }
 656 
 657 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 658   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 659 }
 660 
 661 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 662   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 663   movq(dst, rscratch1);
 664 }
 665 
 666 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 667   if (src.is_lval()) {
 668     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 669   } else {
 670     if (reachable(src)) {
 671       movq(dst, as_Address(src));
 672     } else {
 673       lea(scratch, src);
 674       movq(dst, Address(scratch, 0));
 675     }
 676   }
 677 }
 678 
 679 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 680   movq(as_Address(dst), src);
 681 }
 682 
 683 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 684   movq(dst, as_Address(src));
 685 }
 686 
 687 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 688 void MacroAssembler::movptr(Address dst, intptr_t src) {
 689   if (is_simm32(src)) {
 690     movptr(dst, checked_cast<int32_t>(src));
 691   } else {
 692     mov64(rscratch1, src);
 693     movq(dst, rscratch1);
 694   }
 695 }
 696 
 697 // These are mostly for initializing NULL
 698 void MacroAssembler::movptr(Address dst, int32_t src) {
 699   movslq(dst, src);
 700 }
 701 
 702 void MacroAssembler::movptr(Register dst, int32_t src) {
 703   mov64(dst, (intptr_t)src);
 704 }
 705 
 706 void MacroAssembler::pushoop(jobject obj) {
 707   movoop(rscratch1, obj);
 708   push(rscratch1);
 709 }
 710 
 711 void MacroAssembler::pushklass(Metadata* obj) {
 712   mov_metadata(rscratch1, obj);
 713   push(rscratch1);
 714 }
 715 
 716 void MacroAssembler::pushptr(AddressLiteral src) {
 717   lea(rscratch1, src);
 718   if (src.is_lval()) {
 719     push(rscratch1);
 720   } else {
 721     pushq(Address(rscratch1, 0));
 722   }
 723 }
 724 
 725 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 726   reset_last_Java_frame(r15_thread, clear_fp);
 727 }
 728 
 729 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 730                                          Register last_java_fp,
 731                                          address  last_java_pc) {
 732   vzeroupper();
 733   // determine last_java_sp register
 734   if (!last_java_sp->is_valid()) {
 735     last_java_sp = rsp;
 736   }
 737 
 738   // last_java_fp is optional
 739   if (last_java_fp->is_valid()) {
 740     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 741            last_java_fp);
 742   }
 743 
 744   // last_java_pc is optional
 745   if (last_java_pc != NULL) {
 746     Address java_pc(r15_thread,
 747                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 748     lea(rscratch1, InternalAddress(last_java_pc));
 749     movptr(java_pc, rscratch1);
 750   }
 751 
 752   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 753 }
 754 
 755 static void pass_arg0(MacroAssembler* masm, Register arg) {
 756   if (c_rarg0 != arg ) {
 757     masm->mov(c_rarg0, arg);
 758   }
 759 }
 760 
 761 static void pass_arg1(MacroAssembler* masm, Register arg) {
 762   if (c_rarg1 != arg ) {
 763     masm->mov(c_rarg1, arg);
 764   }
 765 }
 766 
 767 static void pass_arg2(MacroAssembler* masm, Register arg) {
 768   if (c_rarg2 != arg ) {
 769     masm->mov(c_rarg2, arg);
 770   }
 771 }
 772 
 773 static void pass_arg3(MacroAssembler* masm, Register arg) {
 774   if (c_rarg3 != arg ) {
 775     masm->mov(c_rarg3, arg);
 776   }
 777 }
 778 
 779 void MacroAssembler::stop(const char* msg) {
 780   if (ShowMessageBoxOnError) {
 781     address rip = pc();
 782     pusha(); // get regs on stack
 783     lea(c_rarg1, InternalAddress(rip));
 784     movq(c_rarg2, rsp); // pass pointer to regs array
 785   }
 786   lea(c_rarg0, ExternalAddress((address) msg));
 787   andq(rsp, -16); // align stack as required by ABI
 788   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 789   hlt();
 790 }
 791 
 792 void MacroAssembler::warn(const char* msg) {
 793   push(rbp);
 794   movq(rbp, rsp);
 795   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 796   push_CPU_state();   // keeps alignment at 16 bytes
 797   lea(c_rarg0, ExternalAddress((address) msg));
 798   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
 799   call(rax);
 800   pop_CPU_state();
 801   mov(rsp, rbp);
 802   pop(rbp);
 803 }
 804 
 805 void MacroAssembler::print_state() {
 806   address rip = pc();
 807   pusha();            // get regs on stack
 808   push(rbp);
 809   movq(rbp, rsp);
 810   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 811   push_CPU_state();   // keeps alignment at 16 bytes
 812 
 813   lea(c_rarg0, InternalAddress(rip));
 814   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 815   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 816 
 817   pop_CPU_state();
 818   mov(rsp, rbp);
 819   pop(rbp);
 820   popa();
 821 }
 822 
 823 #ifndef PRODUCT
 824 extern "C" void findpc(intptr_t x);
 825 #endif
 826 
 827 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 828   // In order to get locks to work, we need to fake an in_VM state
 829   if (ShowMessageBoxOnError) {
 830     JavaThread* thread = JavaThread::current();
 831     JavaThreadState saved_state = thread->thread_state();
 832     thread->set_thread_state(_thread_in_vm);
 833 #ifndef PRODUCT
 834     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 835       ttyLocker ttyl;
 836       BytecodeCounter::print();
 837     }
 838 #endif
 839     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 840     // XXX correct this offset for amd64
 841     // This is the value of eip which points to where verify_oop will return.
 842     if (os::message_box(msg, "Execution stopped, print registers?")) {
 843       print_state64(pc, regs);
 844       BREAKPOINT;
 845     }
 846   }
 847   fatal("DEBUG MESSAGE: %s", msg);
 848 }
 849 
 850 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 851   ttyLocker ttyl;
 852   FlagSetting fs(Debugging, true);
 853   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 854 #ifndef PRODUCT
 855   tty->cr();
 856   findpc(pc);
 857   tty->cr();
 858 #endif
 859 #define PRINT_REG(rax, value) \
 860   { tty->print("%s = ", #rax); os::print_location(tty, value); }
 861   PRINT_REG(rax, regs[15]);
 862   PRINT_REG(rbx, regs[12]);
 863   PRINT_REG(rcx, regs[14]);
 864   PRINT_REG(rdx, regs[13]);
 865   PRINT_REG(rdi, regs[8]);
 866   PRINT_REG(rsi, regs[9]);
 867   PRINT_REG(rbp, regs[10]);
 868   // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
 869   PRINT_REG(rsp, (intptr_t)(&regs[16]));
 870   PRINT_REG(r8 , regs[7]);
 871   PRINT_REG(r9 , regs[6]);
 872   PRINT_REG(r10, regs[5]);
 873   PRINT_REG(r11, regs[4]);
 874   PRINT_REG(r12, regs[3]);
 875   PRINT_REG(r13, regs[2]);
 876   PRINT_REG(r14, regs[1]);
 877   PRINT_REG(r15, regs[0]);
 878 #undef PRINT_REG
 879   // Print some words near the top of the stack.
 880   int64_t* rsp = &regs[16];
 881   int64_t* dump_sp = rsp;
 882   for (int col1 = 0; col1 < 8; col1++) {
 883     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 884     os::print_location(tty, *dump_sp++);
 885   }
 886   for (int row = 0; row < 25; row++) {
 887     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 888     for (int col = 0; col < 4; col++) {
 889       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 890     }
 891     tty->cr();
 892   }
 893   // Print some instructions around pc:
 894   Disassembler::decode((address)pc-64, (address)pc);
 895   tty->print_cr("--------");
 896   Disassembler::decode((address)pc, (address)pc+32);
 897 }
 898 
 899 // The java_calling_convention describes stack locations as ideal slots on
 900 // a frame with no abi restrictions. Since we must observe abi restrictions
 901 // (like the placement of the register window) the slots must be biased by
 902 // the following value.
 903 static int reg2offset_in(VMReg r) {
 904   // Account for saved rbp and return address
 905   // This should really be in_preserve_stack_slots
 906   return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
 907 }
 908 
 909 static int reg2offset_out(VMReg r) {
 910   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
 911 }
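
// Worked example (illustrative): an incoming argument in stack slot 0 sits just above
// the saved rbp and return address, i.e. at rbp + (0 + 4) * VMRegImpl::stack_slot_size
// = rbp + 16, since stack slots are 4 bytes and the two saved words occupy 4 slots.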
 912 
 913 // A long move
 914 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 915 
 916   // The calling convention assures us that each VMRegPair is either
 917   // really one physical register or a pair of adjacent stack slots.
 918 
 919   if (src.is_single_phys_reg() ) {
 920     if (dst.is_single_phys_reg()) {
 921       if (dst.first() != src.first()) {
 922         mov(dst.first()->as_Register(), src.first()->as_Register());
 923       }
 924     } else {
 925       assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
 926              src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
 927       movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
 928     }
 929   } else if (dst.is_single_phys_reg()) {
 930     assert(src.is_single_reg(),  "not a stack pair");
 931     movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 932   } else {
 933     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 934     movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 935     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 936   }
 937 }
 938 
 939 // A double move
 940 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 941 
 942   // The calling convention assures us that each VMRegPair is either
 943   // really one physical register or a pair of adjacent stack slots.
 944 
 945   if (src.is_single_phys_reg() ) {
 946     if (dst.is_single_phys_reg()) {
 947       // In theory these overlap but the ordering is such that this is likely a nop
 948       if ( src.first() != dst.first()) {
 949         movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
 950       }
 951     } else {
 952       assert(dst.is_single_reg(), "not a stack pair");
 953       movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
 954     }
 955   } else if (dst.is_single_phys_reg()) {
 956     assert(src.is_single_reg(),  "not a stack pair");
 957     movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 958   } else {
 959     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 960     movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 961     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 962   }
 963 }
 964 
 965 
 966 // A float arg may have to do a float reg to int reg conversion
 967 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
 968   assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
 969 
 970   // The calling convention assures us that each VMRegPair is either
 971   // really one physical register or a pair of adjacent stack slots.
 972 
 973   if (src.first()->is_stack()) {
 974     if (dst.first()->is_stack()) {
 975       movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 976       movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
 977     } else {
 978       // stack to reg
 979       assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
 980       movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
 981     }
 982   } else if (dst.first()->is_stack()) {
 983     // reg to stack
 984     assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
 985     movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
 986   } else {
 987     // reg to reg
 988     // In theory these overlap but the ordering is such that this is likely a nop
 989     if ( src.first() != dst.first()) {
 990       movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
 991     }
 992   }
 993 }
 994 
 995 // On 64 bit we will store integer-like items to the stack as
 996 // 64-bit items (x86_32/64 ABI) even though Java would only store
 997 // 32 bits for a parameter. On 32 bit it will simply be 32 bits.
 998 // So this routine will do 32->32 on 32 bit and 32->64 on 64 bit.
 999 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
1000   if (src.first()->is_stack()) {
1001     if (dst.first()->is_stack()) {
1002       // stack to stack
1003       movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
1004       movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
1005     } else {
1006       // stack to reg
1007       movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
1008     }
1009   } else if (dst.first()->is_stack()) {
1010     // reg to stack
1011     // Do we really have to sign extend???
1012     // __ movslq(src.first()->as_Register(), src.first()->as_Register());
1013     movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
1014   } else {
1015     // Do we really have to sign extend???
1016     // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
1017     if (dst.first() != src.first()) {
1018       movq(dst.first()->as_Register(), src.first()->as_Register());
1019     }
1020   }
1021 }
1022 
1023 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
1024   if (src.first()->is_stack()) {
1025     if (dst.first()->is_stack()) {
1026       // stack to stack
1027       movq(rax, Address(rbp, reg2offset_in(src.first())));
1028       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1029     } else {
1030       // stack to reg
1031       movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1032     }
1033   } else if (dst.first()->is_stack()) {
1034     // reg to stack
1035     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1036   } else {
1037     if (dst.first() != src.first()) {
1038       movq(dst.first()->as_Register(), src.first()->as_Register());
1039     }
1040   }
1041 }
1042 
1043 // An oop arg. Must pass a handle not the oop itself
1044 void MacroAssembler::object_move(OopMap* map,
1045                         int oop_handle_offset,
1046                         int framesize_in_slots,
1047                         VMRegPair src,
1048                         VMRegPair dst,
1049                         bool is_receiver,
1050                         int* receiver_offset) {
1051 
1052   // must pass a handle. First figure out the location we use as a handle
1053 
1054   Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1055 
1056   // See if oop is NULL; if it is we need no handle
1057 
1058   if (src.first()->is_stack()) {
1059 
1060     // Oop is already on the stack as an argument
1061     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1062     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1063     if (is_receiver) {
1064       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1065     }
1066 
1067     cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
1068     lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1069     // conditionally move a NULL
1070     cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1071   } else {
1072 
1073     // Oop is in a register we must store it to the space we reserve
1074     // on the stack for oop_handles and pass a handle if oop is non-NULL
1075 
1076     const Register rOop = src.first()->as_Register();
1077     int oop_slot;
1078     if (rOop == j_rarg0)
1079       oop_slot = 0;
1080     else if (rOop == j_rarg1)
1081       oop_slot = 1;
1082     else if (rOop == j_rarg2)
1083       oop_slot = 2;
1084     else if (rOop == j_rarg3)
1085       oop_slot = 3;
1086     else if (rOop == j_rarg4)
1087       oop_slot = 4;
1088     else {
1089       assert(rOop == j_rarg5, "wrong register");
1090       oop_slot = 5;
1091     }
1092 
1093     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1094     int offset = oop_slot*VMRegImpl::stack_slot_size;
1095 
1096     map->set_oop(VMRegImpl::stack2reg(oop_slot));
1097     // Store oop in handle area, may be NULL
1098     movptr(Address(rsp, offset), rOop);
1099     if (is_receiver) {
1100       *receiver_offset = offset;
1101     }
1102 
1103     cmpptr(rOop, (int32_t)NULL_WORD);
1104     lea(rHandle, Address(rsp, offset));
1105     // conditionally move a NULL from the handle area where it was just stored
1106     cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1107   }
1108 
1109   // If arg is on the stack then place it, otherwise it is already in the correct reg.
1110   if (dst.first()->is_stack()) {
1111     movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1112   }
1113 }
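
// In effect (illustrative sketch, not VM code): the handle passed to the native side is
// the address of the slot holding the oop, or NULL if the oop itself is NULL; the cmov
// above reloads the (NULL) slot contents into rHandle instead of branching.
//
//   jobject handle = (oop_value != NULL) ? (jobject)&slot_holding_the_oop : NULL;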
1114 
1115 #endif // _LP64
1116 
1117 // Now versions that are common to 32/64 bit
1118 
1119 void MacroAssembler::addptr(Register dst, int32_t imm32) {
1120   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
1121 }
1122 
1123 void MacroAssembler::addptr(Register dst, Register src) {
1124   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1125 }
1126 
1127 void MacroAssembler::addptr(Address dst, Register src) {
1128   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1129 }
1130 
1131 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
1132   if (reachable(src)) {
1133     Assembler::addsd(dst, as_Address(src));
1134   } else {
1135     lea(rscratch1, src);
1136     Assembler::addsd(dst, Address(rscratch1, 0));
1137   }
1138 }
1139 
1140 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
1141   if (reachable(src)) {
1142     addss(dst, as_Address(src));
1143   } else {
1144     lea(rscratch1, src);
1145     addss(dst, Address(rscratch1, 0));
1146   }
1147 }
1148 
1149 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
1150   if (reachable(src)) {
1151     Assembler::addpd(dst, as_Address(src));
1152   } else {
1153     lea(rscratch1, src);
1154     Assembler::addpd(dst, Address(rscratch1, 0));
1155   }
1156 }
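
// The reachable()/lea pattern used above (and throughout this file) exists because these
// operands are RIP-relative: if the literal is within a 32-bit displacement of the code
// it is addressed directly, otherwise its address is materialized in a scratch register
// first. Sketch of the idiom (illustrative):
//
//   if (reachable(src)) {
//     op(dst, as_Address(src));            // RIP-relative operand
//   } else {
//     lea(rscratch1, src);                 // 64-bit absolute address into scratch
//     op(dst, Address(rscratch1, 0));
//   }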
1157 
1158 // See 8273459.  Function for ensuring 64-byte alignment, intended for stubs only.
1159 // Stub code is generated once and never copied.
1160 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
1161 void MacroAssembler::align64() {
1162   align(64, (unsigned long long) pc());
1163 }
1164 
1165 void MacroAssembler::align32() {
1166   align(32, (unsigned long long) pc());
1167 }
1168 
1169 void MacroAssembler::align(int modulus) {
1170   // 8273459: Ensure alignment is possible with current segment alignment
1171   assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
1172   align(modulus, offset());
1173 }
1174 
1175 void MacroAssembler::align(int modulus, int target) {
1176   if (target % modulus != 0) {
1177     nop(modulus - (target % modulus));
1178   }
1179 }
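
// Worked example (illustrative): if the current code offset is 13, align(32) emits
// nop(32 - 13 % 32) = nop(19) so the next instruction starts at offset 32; an offset
// that is already a multiple of 32 emits nothing.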
1180 
1181 void MacroAssembler::push_f(XMMRegister r) {
1182   subptr(rsp, wordSize);
1183   movflt(Address(rsp, 0), r);
1184 }
1185 
1186 void MacroAssembler::pop_f(XMMRegister r) {
1187   movflt(r, Address(rsp, 0));
1188   addptr(rsp, wordSize);
1189 }
1190 
1191 void MacroAssembler::push_d(XMMRegister r) {
1192   subptr(rsp, 2 * wordSize);
1193   movdbl(Address(rsp, 0), r);
1194 }
1195 
1196 void MacroAssembler::pop_d(XMMRegister r) {
1197   movdbl(r, Address(rsp, 0));
1198   addptr(rsp, 2 * Interpreter::stackElementSize);
1199 }
1200 
1201 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1202   // Used in sign-masking with aligned address.
1203   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1204   if (reachable(src)) {
1205     Assembler::andpd(dst, as_Address(src));
1206   } else {
1207     lea(scratch_reg, src);
1208     Assembler::andpd(dst, Address(scratch_reg, 0));
1209   }
1210 }
1211 
1212 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1213   // Used in sign-masking with aligned address.
1214   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1215   if (reachable(src)) {
1216     Assembler::andps(dst, as_Address(src));
1217   } else {
1218     lea(scratch_reg, src);
1219     Assembler::andps(dst, Address(scratch_reg, 0));
1220   }
1221 }
1222 
1223 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1224   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1225 }
1226 
1227 void MacroAssembler::atomic_incl(Address counter_addr) {
1228   lock();
1229   incrementl(counter_addr);
1230 }
1231 
1232 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1233   if (reachable(counter_addr)) {
1234     atomic_incl(as_Address(counter_addr));
1235   } else {
1236     lea(scr, counter_addr);
1237     atomic_incl(Address(scr, 0));
1238   }
1239 }
1240 
1241 #ifdef _LP64
1242 void MacroAssembler::atomic_incq(Address counter_addr) {
1243   lock();
1244   incrementq(counter_addr);
1245 }
1246 
1247 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1248   if (reachable(counter_addr)) {
1249     atomic_incq(as_Address(counter_addr));
1250   } else {
1251     lea(scr, counter_addr);
1252     atomic_incq(Address(scr, 0));
1253   }
1254 }
1255 #endif
1256 
1257 // Writes to successive stack pages, until the given offset is reached, to check for
1258 // stack overflow + shadow pages.  This clobbers tmp.
1259 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1260   movptr(tmp, rsp);
1261   // Bang stack for total size given plus shadow page size.
1262   // Bang one page at a time because large size can bang beyond yellow and
1263   // red zones.
1264   Label loop;
1265   bind(loop);
1266   movl(Address(tmp, (-os::vm_page_size())), size );
1267   subptr(tmp, os::vm_page_size());
1268   subl(size, os::vm_page_size());
1269   jcc(Assembler::greater, loop);
1270 
1271   // Bang down shadow pages too.
1272   // At this point, (tmp-0) is the last address touched, so don't
1273   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1274   // was post-decremented.)  Skip this address by starting at i=1, and
1275   // touch a few more pages below.  N.B.  It is important to touch all
1276   // the way down including all pages in the shadow zone.
1277   for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / os::vm_page_size()); i++) {
1278     // this could be any sized move but this can be a debugging crumb
1279     // so the bigger the better.
1280     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1281   }
1282 }
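
// Rough C sketch of the banging above (illustrative; 'page' stands for os::vm_page_size()
// and 'size' for the requested frame size in bytes):
//
//   char* p = (char*)rsp;
//   for (long remaining = size; remaining > 0; remaining -= page) {
//     p -= page;
//     *(volatile int*)p = (int)size;                 // touch one page at a time
//   }
//   for (int i = 1; i < shadow_zone_pages; i++) {
//     *(volatile int*)(p - i * page) = (int)size;    // then walk the shadow zone
//   }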
1283 
1284 void MacroAssembler::reserved_stack_check() {
1285     // testing if reserved zone needs to be enabled
1286     Label no_reserved_zone_enabling;
1287     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1288     NOT_LP64(get_thread(rsi);)
1289 
1290     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1291     jcc(Assembler::below, no_reserved_zone_enabling);
1292 
1293     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1294     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1295     should_not_reach_here();
1296 
1297     bind(no_reserved_zone_enabling);
1298 }
1299 
1300 void MacroAssembler::c2bool(Register x) {
1301   // implements x == 0 ? 0 : 1
1302   // note: must only look at the least-significant byte of x
1303   //       since C-style booleans are stored in one byte
1304   //       only! (was bug)
1305   andl(x, 0xFF);
1306   setb(Assembler::notZero, x);
1307 }
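
// Worked example (illustrative): for x = 0x12345600 the low byte is zero, so andl leaves
// ZF set and setb(notZero) stores 0; for x = 0x12345601 the result is 1. Only the low
// byte matters because a C-style bool occupies a single byte.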
1308 
1309 // Wouldn't need if AddressLiteral version had new name
1310 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1311   Assembler::call(L, rtype);
1312 }
1313 
1314 void MacroAssembler::call(Register entry) {
1315   Assembler::call(entry);
1316 }
1317 
1318 void MacroAssembler::call(AddressLiteral entry) {
1319   if (reachable(entry)) {
1320     Assembler::call_literal(entry.target(), entry.rspec());
1321   } else {
1322     lea(rscratch1, entry);
1323     Assembler::call(rscratch1);
1324   }
1325 }
1326 
1327 void MacroAssembler::ic_call(address entry, jint method_index) {
1328   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
1329   movptr(rax, (intptr_t)Universe::non_oop_word());
1330   call(AddressLiteral(entry, rh));
1331 }
1332 
1333 void MacroAssembler::emit_static_call_stub() {
1334   // Static stub relocation also tags the Method* in the code-stream.
1335   mov_metadata(rbx, (Metadata*) NULL);  // Method is zapped till fixup time.
1336   // This is recognized as unresolved by relocs/nativeinst/ic code.
1337   jump(RuntimeAddress(pc()));
1338 }
1339 
1340 // Implementation of call_VM versions
1341 
1342 void MacroAssembler::call_VM(Register oop_result,
1343                              address entry_point,
1344                              bool check_exceptions) {
1345   Label C, E;
1346   call(C, relocInfo::none);
1347   jmp(E);
1348 
1349   bind(C);
1350   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1351   ret(0);
1352 
1353   bind(E);
1354 }
1355 
1356 void MacroAssembler::call_VM(Register oop_result,
1357                              address entry_point,
1358                              Register arg_1,
1359                              bool check_exceptions) {
1360   Label C, E;
1361   call(C, relocInfo::none);
1362   jmp(E);
1363 
1364   bind(C);
1365   pass_arg1(this, arg_1);
1366   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1367   ret(0);
1368 
1369   bind(E);
1370 }
1371 
1372 void MacroAssembler::call_VM(Register oop_result,
1373                              address entry_point,
1374                              Register arg_1,
1375                              Register arg_2,
1376                              bool check_exceptions) {
1377   Label C, E;
1378   call(C, relocInfo::none);
1379   jmp(E);
1380 
1381   bind(C);
1382 
1383   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1384 
1385   pass_arg2(this, arg_2);
1386   pass_arg1(this, arg_1);
1387   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1388   ret(0);
1389 
1390   bind(E);
1391 }
1392 
1393 void MacroAssembler::call_VM(Register oop_result,
1394                              address entry_point,
1395                              Register arg_1,
1396                              Register arg_2,
1397                              Register arg_3,
1398                              bool check_exceptions) {
1399   Label C, E;
1400   call(C, relocInfo::none);
1401   jmp(E);
1402 
1403   bind(C);
1404 
1405   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1406   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1407   pass_arg3(this, arg_3);
1408 
1409   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1410   pass_arg2(this, arg_2);
1411 
1412   pass_arg1(this, arg_1);
1413   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1414   ret(0);
1415 
1416   bind(E);
1417 }
1418 
1419 void MacroAssembler::call_VM(Register oop_result,
1420                              Register last_java_sp,
1421                              address entry_point,
1422                              int number_of_arguments,
1423                              bool check_exceptions) {
1424   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1425   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1426 }
1427 
1428 void MacroAssembler::call_VM(Register oop_result,
1429                              Register last_java_sp,
1430                              address entry_point,
1431                              Register arg_1,
1432                              bool check_exceptions) {
1433   pass_arg1(this, arg_1);
1434   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1435 }
1436 
1437 void MacroAssembler::call_VM(Register oop_result,
1438                              Register last_java_sp,
1439                              address entry_point,
1440                              Register arg_1,
1441                              Register arg_2,
1442                              bool check_exceptions) {
1443 
1444   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1445   pass_arg2(this, arg_2);
1446   pass_arg1(this, arg_1);
1447   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1448 }
1449 
1450 void MacroAssembler::call_VM(Register oop_result,
1451                              Register last_java_sp,
1452                              address entry_point,
1453                              Register arg_1,
1454                              Register arg_2,
1455                              Register arg_3,
1456                              bool check_exceptions) {
1457   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1458   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1459   pass_arg3(this, arg_3);
1460   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1461   pass_arg2(this, arg_2);
1462   pass_arg1(this, arg_1);
1463   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1464 }
1465 
1466 void MacroAssembler::super_call_VM(Register oop_result,
1467                                    Register last_java_sp,
1468                                    address entry_point,
1469                                    int number_of_arguments,
1470                                    bool check_exceptions) {
1471   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1472   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1473 }
1474 
1475 void MacroAssembler::super_call_VM(Register oop_result,
1476                                    Register last_java_sp,
1477                                    address entry_point,
1478                                    Register arg_1,
1479                                    bool check_exceptions) {
1480   pass_arg1(this, arg_1);
1481   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1482 }
1483 
1484 void MacroAssembler::super_call_VM(Register oop_result,
1485                                    Register last_java_sp,
1486                                    address entry_point,
1487                                    Register arg_1,
1488                                    Register arg_2,
1489                                    bool check_exceptions) {
1490 
1491   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1492   pass_arg2(this, arg_2);
1493   pass_arg1(this, arg_1);
1494   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1495 }
1496 
1497 void MacroAssembler::super_call_VM(Register oop_result,
1498                                    Register last_java_sp,
1499                                    address entry_point,
1500                                    Register arg_1,
1501                                    Register arg_2,
1502                                    Register arg_3,
1503                                    bool check_exceptions) {
1504   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1505   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1506   pass_arg3(this, arg_3);
1507   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1508   pass_arg2(this, arg_2);
1509   pass_arg1(this, arg_1);
1510   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1511 }
1512 
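     // call_VM_base is the common bottleneck for the call_VM variants above.
     // Summarizing the code below: it determines the thread and last_java_sp
     // registers, passes the current thread as the first C argument, records
     // the last Java frame, performs the leaf call, restores the thread and
     // clears the frame anchor, then (optionally) forwards any pending
     // exception and fetches the oop result out of the thread.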
1513 void MacroAssembler::call_VM_base(Register oop_result,
1514                                   Register java_thread,
1515                                   Register last_java_sp,
1516                                   address  entry_point,
1517                                   int      number_of_arguments,
1518                                   bool     check_exceptions) {
1519   // determine java_thread register
1520   if (!java_thread->is_valid()) {
1521 #ifdef _LP64
1522     java_thread = r15_thread;
1523 #else
1524     java_thread = rdi;
1525     get_thread(java_thread);
1526 #endif // LP64
1527   }
1528   // determine last_java_sp register
1529   if (!last_java_sp->is_valid()) {
1530     last_java_sp = rsp;
1531   }
1532   // debugging support
1533   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
1534   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
1535 #ifdef ASSERT
1536   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1537   // r12 is the heapbase.
1538   LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
1539 #endif // ASSERT
1540 
1541   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
1542   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1543 
1544   // push java thread (becomes first argument of C function)
1545 
1546   NOT_LP64(push(java_thread); number_of_arguments++);
1547   LP64_ONLY(mov(c_rarg0, r15_thread));
1548 
1549   // set last Java frame before call
1550   assert(last_java_sp != rbp, "can't use ebp/rbp");
1551 
1552   // Only interpreter should have to set fp
1553   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
1554 
1555   // do the call, remove parameters
1556   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1557 
1558   // restore the thread (cannot use the pushed argument since arguments
1559   // may be overwritten by C code generated by an optimizing compiler);
1560   // however, we can use the register value directly if it is callee saved.
1561   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
1562     // rdi & rsi (also r15) are callee saved -> nothing to do
1563 #ifdef ASSERT
1564     guarantee(java_thread != rax, "change this code");
1565     push(rax);
1566     { Label L;
1567       get_thread(rax);
1568       cmpptr(java_thread, rax);
1569       jcc(Assembler::equal, L);
1570       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
1571       bind(L);
1572     }
1573     pop(rax);
1574 #endif
1575   } else {
1576     get_thread(java_thread);
1577   }
1578   // reset last Java frame
1579   // Only interpreter should have to clear fp
1580   reset_last_Java_frame(java_thread, true);
1581 
1582   // C++ interp handles this in the interpreter
1583   check_and_handle_popframe(java_thread);
1584   check_and_handle_earlyret(java_thread);
1585 
1586   if (check_exceptions) {
1587     // check for pending exceptions (java_thread is set upon return)
1588     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
1589 #ifndef _LP64
1590     jump_cc(Assembler::notEqual,
1591             RuntimeAddress(StubRoutines::forward_exception_entry()));
1592 #else
1593     // This used to conditionally jump straight to forward_exception, but
1594     // if the code is relocated that branch might not reach its target. So
1595     // jump around an unconditional jump, which can always reach.
1596 
1597     Label ok;
1598     jcc(Assembler::equal, ok);
1599     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1600     bind(ok);
1601 #endif // LP64
1602   }
1603 
1604   // get oop result if there is one and reset the value in the thread
1605   if (oop_result->is_valid()) {
1606     get_vm_result(oop_result, java_thread);
1607   }
1608 }
1609 
1610 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1611 
1612   // Calculating the value for last_Java_sp is somewhat subtle.
1613   // call_VM does an intermediate call which places a return address
1614   // on the stack just under the stack pointer as the caller left it.
1615   // This allows us to retrieve last_Java_pc from last_Java_sp[-1].
1616   // On 32bit we then have to push additional args on the stack to
1617   // accomplish the actual requested call. On 64bit call_VM can only
1618   // use register args, so the only extra space is the return address
1619   // that call_VM created.
1620   // This hopefully explains the calculations here.
1621 
1622 #ifdef _LP64
1623   // We've pushed one address, correct last_Java_sp
1624   lea(rax, Address(rsp, wordSize));
1625 #else
1626   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
1627 #endif // LP64
1628 
1629   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
1630 
1631 }
1632 
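     // Illustrative use of the wrappers above (a sketch only; the entry points
     // and argument registers here are hypothetical, not taken from this file):
     //
     //   call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::some_entry), rbx);
     //   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::some_helper), rax);
     //
     // call_VM establishes the last Java frame and checks for pending
     // exceptions on return; call_VM_leaf skips that machinery and is meant
     // for runtime helpers that cannot block or throw.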
1633 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1634 void MacroAssembler::call_VM_leaf0(address entry_point) {
1635   MacroAssembler::call_VM_leaf_base(entry_point, 0);
1636 }
1637 
1638 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1639   call_VM_leaf_base(entry_point, number_of_arguments);
1640 }
1641 
1642 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1643   pass_arg0(this, arg_0);
1644   call_VM_leaf(entry_point, 1);
1645 }
1646 
1647 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1648 
1649   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1650   pass_arg1(this, arg_1);
1651   pass_arg0(this, arg_0);
1652   call_VM_leaf(entry_point, 2);
1653 }
1654 
1655 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1656   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1657   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1658   pass_arg2(this, arg_2);
1659   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1660   pass_arg1(this, arg_1);
1661   pass_arg0(this, arg_0);
1662   call_VM_leaf(entry_point, 3);
1663 }
1664 
1665 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1666   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1667   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1668   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1669   pass_arg3(this, arg_3);
1670   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1671   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1672   pass_arg2(this, arg_2);
1673   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1674   pass_arg1(this, arg_1);
1675   pass_arg0(this, arg_0);
1676   call_VM_leaf(entry_point, 4);
1677 }
1678 
1679 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1680   pass_arg0(this, arg_0);
1681   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1682 }
1683 
1684 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1685 
1686   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1687   pass_arg1(this, arg_1);
1688   pass_arg0(this, arg_0);
1689   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1690 }
1691 
1692 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1693   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1694   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1695   pass_arg2(this, arg_2);
1696   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1697   pass_arg1(this, arg_1);
1698   pass_arg0(this, arg_0);
1699   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1700 }
1701 
1702 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1703   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1704   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1705   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1706   pass_arg3(this, arg_3);
1707   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1708   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1709   pass_arg2(this, arg_2);
1710   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1711   pass_arg1(this, arg_1);
1712   pass_arg0(this, arg_0);
1713   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1714 }
1715 
1716 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1717   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1718   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
1719   verify_oop_msg(oop_result, "broken oop in call_VM_base");
1720 }
1721 
1722 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1723   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1724   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
1725 }
1726 
1727 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
1728 }
1729 
1730 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
1731 }
1732 
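     // Many of the AddressLiteral overloads below share one pattern: on 64-bit
     // an absolute address may lie outside the +/-2GB rip-relative range of the
     // code being generated, so when reachable(src) is false the address is
     // materialized into rscratch1 with lea and the operation uses an indirect
     // [rscratch1] operand instead; on 32-bit every address is reachable and
     // the direct form is always used.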
1733 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
1734   if (reachable(src1)) {
1735     cmpl(as_Address(src1), imm);
1736   } else {
1737     lea(rscratch1, src1);
1738     cmpl(Address(rscratch1, 0), imm);
1739   }
1740 }
1741 
1742 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
1743   assert(!src2.is_lval(), "use cmpptr");
1744   if (reachable(src2)) {
1745     cmpl(src1, as_Address(src2));
1746   } else {
1747     lea(rscratch1, src2);
1748     cmpl(src1, Address(rscratch1, 0));
1749   }
1750 }
1751 
1752 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1753   Assembler::cmpl(src1, imm);
1754 }
1755 
1756 void MacroAssembler::cmp32(Register src1, Address src2) {
1757   Assembler::cmpl(src1, src2);
1758 }
1759 
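     // cmpsd2int/cmpss2int turn a floating-point compare into a three-way
     // integer result: dst = -1, 0 or +1 for less, equal, greater. ucomisd and
     // ucomiss set PF when the operands are unordered (a NaN is present), and
     // unordered_is_less selects whether that case yields -1 or +1, matching
     // the Java dcmpl/dcmpg and fcmpl/fcmpg semantics.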
1760 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1761   ucomisd(opr1, opr2);
1762 
1763   Label L;
1764   if (unordered_is_less) {
1765     movl(dst, -1);
1766     jcc(Assembler::parity, L);
1767     jcc(Assembler::below , L);
1768     movl(dst, 0);
1769     jcc(Assembler::equal , L);
1770     increment(dst);
1771   } else { // unordered is greater
1772     movl(dst, 1);
1773     jcc(Assembler::parity, L);
1774     jcc(Assembler::above , L);
1775     movl(dst, 0);
1776     jcc(Assembler::equal , L);
1777     decrementl(dst);
1778   }
1779   bind(L);
1780 }
1781 
1782 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1783   ucomiss(opr1, opr2);
1784 
1785   Label L;
1786   if (unordered_is_less) {
1787     movl(dst, -1);
1788     jcc(Assembler::parity, L);
1789     jcc(Assembler::below , L);
1790     movl(dst, 0);
1791     jcc(Assembler::equal , L);
1792     increment(dst);
1793   } else { // unordered is greater
1794     movl(dst, 1);
1795     jcc(Assembler::parity, L);
1796     jcc(Assembler::above , L);
1797     movl(dst, 0);
1798     jcc(Assembler::equal , L);
1799     decrementl(dst);
1800   }
1801   bind(L);
1802 }
1803 
1804 
1805 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
1806   if (reachable(src1)) {
1807     cmpb(as_Address(src1), imm);
1808   } else {
1809     lea(rscratch1, src1);
1810     cmpb(Address(rscratch1, 0), imm);
1811   }
1812 }
1813 
1814 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
1815 #ifdef _LP64
1816   if (src2.is_lval()) {
1817     movptr(rscratch1, src2);
1818     Assembler::cmpq(src1, rscratch1);
1819   } else if (reachable(src2)) {
1820     cmpq(src1, as_Address(src2));
1821   } else {
1822     lea(rscratch1, src2);
1823     Assembler::cmpq(src1, Address(rscratch1, 0));
1824   }
1825 #else
1826   if (src2.is_lval()) {
1827     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1828   } else {
1829     cmpl(src1, as_Address(src2));
1830   }
1831 #endif // _LP64
1832 }
1833 
1834 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
1835   assert(src2.is_lval(), "not a mem-mem compare");
1836 #ifdef _LP64
1837   // moves src2's literal address
1838   movptr(rscratch1, src2);
1839   Assembler::cmpq(src1, rscratch1);
1840 #else
1841   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1842 #endif // _LP64
1843 }
1844 
1845 void MacroAssembler::cmpoop(Register src1, Register src2) {
1846   cmpptr(src1, src2);
1847 }
1848 
1849 void MacroAssembler::cmpoop(Register src1, Address src2) {
1850   cmpptr(src1, src2);
1851 }
1852 
1853 #ifdef _LP64
1854 void MacroAssembler::cmpoop(Register src1, jobject src2) {
1855   movoop(rscratch1, src2);
1856   cmpptr(src1, rscratch1);
1857 }
1858 #endif
1859 
1860 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
1861   if (reachable(adr)) {
1862     lock();
1863     cmpxchgptr(reg, as_Address(adr));
1864   } else {
1865     lea(rscratch1, adr);
1866     lock();
1867     cmpxchgptr(reg, Address(rscratch1, 0));
1868   }
1869 }
1870 
1871 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
1872   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
1873 }
1874 
1875 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
1876   if (reachable(src)) {
1877     Assembler::comisd(dst, as_Address(src));
1878   } else {
1879     lea(rscratch1, src);
1880     Assembler::comisd(dst, Address(rscratch1, 0));
1881   }
1882 }
1883 
1884 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
1885   if (reachable(src)) {
1886     Assembler::comiss(dst, as_Address(src));
1887   } else {
1888     lea(rscratch1, src);
1889     Assembler::comiss(dst, Address(rscratch1, 0));
1890   }
1891 }
1892 
1893 
1894 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
1895   Condition negated_cond = negate_condition(cond);
1896   Label L;
1897   jcc(negated_cond, L);
1898   pushf(); // Preserve flags
1899   atomic_incl(counter_addr);
1900   popf();
1901   bind(L);
1902 }
1903 
1904 int MacroAssembler::corrected_idivl(Register reg) {
1905   // Full implementation of Java idiv and irem; checks for
1906   // special case as described in JVM spec., p.243 & p.271.
1907   // The function returns the (pc) offset of the idivl
1908   // instruction - may be needed for implicit exceptions.
1909   //
1910   //         normal case                           special case
1911   //
1912   // input : rax: dividend                          min_int
1913   //         reg: divisor   (may not be rax/rdx)    -1
1914   //
1915   // output: rax: quotient  (= rax idiv reg)        min_int
1916   //         rdx: remainder (= rax irem reg)        0
1917   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
1918   const int min_int = 0x80000000;
1919   Label normal_case, special_case;
1920 
1921   // check for special case
1922   cmpl(rax, min_int);
1923   jcc(Assembler::notEqual, normal_case);
1924   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
1925   cmpl(reg, -1);
1926   jcc(Assembler::equal, special_case);
1927 
1928   // handle normal case
1929   bind(normal_case);
1930   cdql();
1931   int idivl_offset = offset();
1932   idivl(reg);
1933 
1934   // normal and special case exit
1935   bind(special_case);
1936 
1937   return idivl_offset;
1938 }
1939 
1940 
1941 
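     // Note for incrementl/decrementl below: min_jint cannot be negated without
     // overflow, so it is handled up front before the (value < 0) case flips
     // the sign and delegates to the opposite helper.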
1942 void MacroAssembler::decrementl(Register reg, int value) {
1943   if (value == min_jint) {subl(reg, value) ; return; }
1944   if (value <  0) { incrementl(reg, -value); return; }
1945   if (value == 0) {                        ; return; }
1946   if (value == 1 && UseIncDec) { decl(reg) ; return; }
1947   /* else */      { subl(reg, value)       ; return; }
1948 }
1949 
1950 void MacroAssembler::decrementl(Address dst, int value) {
1951   if (value == min_jint) {subl(dst, value) ; return; }
1952   if (value <  0) { incrementl(dst, -value); return; }
1953   if (value == 0) {                        ; return; }
1954   if (value == 1 && UseIncDec) { decl(dst) ; return; }
1955   /* else */      { subl(dst, value)       ; return; }
1956 }
1957 
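     // division_with_shift performs signed division by 2^shift_value. An
     // arithmetic right shift alone rounds toward negative infinity, so for a
     // negative dividend (2^shift - 1) is added first to get round-toward-zero
     // behaviour. Example with shift_value = 2: -7 + 3 = -4, -4 >> 2 = -1,
     // matching -7 / 4 == -1, whereas a bare shift would give -2.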
1958 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
1959   assert(shift_value > 0, "illegal shift value");
1960   Label _is_positive;
1961   testl (reg, reg);
1962   jcc (Assembler::positive, _is_positive);
1963   int offset = (1 << shift_value) - 1 ;
1964 
1965   if (offset == 1) {
1966     incrementl(reg);
1967   } else {
1968     addl(reg, offset);
1969   }
1970 
1971   bind (_is_positive);
1972   sarl(reg, shift_value);
1973 }
1974 
1975 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
1976   if (reachable(src)) {
1977     Assembler::divsd(dst, as_Address(src));
1978   } else {
1979     lea(rscratch1, src);
1980     Assembler::divsd(dst, Address(rscratch1, 0));
1981   }
1982 }
1983 
1984 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
1985   if (reachable(src)) {
1986     Assembler::divss(dst, as_Address(src));
1987   } else {
1988     lea(rscratch1, src);
1989     Assembler::divss(dst, Address(rscratch1, 0));
1990   }
1991 }
1992 
1993 void MacroAssembler::enter() {
1994   push(rbp);
1995   mov(rbp, rsp);
1996 }
1997 
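     // post_call_nop emits an 8-byte nop (0F 1F 84 00 imm32, a nopl with SIB
     // byte and 32-bit displacement) tagged with a post_call_nop relocation.
     // This marks post-call sites for the continuation (Loom) runtime; the
     // imm32 is emitted as zero here and leaves room for later patching.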
1998 void MacroAssembler::post_call_nop() {
1999   if (!Continuations::enabled()) {
2000     return;
2001   }
2002   InstructionMark im(this);
2003   relocate(post_call_nop_Relocation::spec());
2004   emit_int8((int8_t)0x0f);
2005   emit_int8((int8_t)0x1f);
2006   emit_int8((int8_t)0x84);
2007   emit_int8((int8_t)0x00);
2008   emit_int32(0x00);
2009 }
2010 
2011 // A 5 byte nop that is safe for patching (see patch_verified_entry)
2012 void MacroAssembler::fat_nop() {
2013   if (UseAddressNop) {
2014     addr_nop_5();
2015   } else {
2016     emit_int8((int8_t)0x26); // es:
2017     emit_int8((int8_t)0x2e); // cs:
2018     emit_int8((int8_t)0x64); // fs:
2019     emit_int8((int8_t)0x65); // gs:
2020     emit_int8((int8_t)0x90);
2021   }
2022 }
2023 
2024 #ifndef _LP64
2025 void MacroAssembler::fcmp(Register tmp) {
2026   fcmp(tmp, 1, true, true);
2027 }
2028 
2029 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2030   assert(!pop_right || pop_left, "usage error");
2031   if (VM_Version::supports_cmov()) {
2032     assert(tmp == noreg, "unneeded temp");
2033     if (pop_left) {
2034       fucomip(index);
2035     } else {
2036       fucomi(index);
2037     }
2038     if (pop_right) {
2039       fpop();
2040     }
2041   } else {
2042     assert(tmp != noreg, "need temp");
2043     if (pop_left) {
2044       if (pop_right) {
2045         fcompp();
2046       } else {
2047         fcomp(index);
2048       }
2049     } else {
2050       fcom(index);
2051     }
2052     // convert FPU condition into eflags condition via rax
2053     save_rax(tmp);
2054     fwait(); fnstsw_ax();
2055     sahf();
2056     restore_rax(tmp);
2057   }
2058   // condition codes set as follows:
2059   //
2060   // CF (corresponds to C0) if x < y
2061   // PF (corresponds to C2) if unordered
2062   // ZF (corresponds to C3) if x = y
2063 }
2064 
2065 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2066   fcmp2int(dst, unordered_is_less, 1, true, true);
2067 }
2068 
2069 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2070   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2071   Label L;
2072   if (unordered_is_less) {
2073     movl(dst, -1);
2074     jcc(Assembler::parity, L);
2075     jcc(Assembler::below , L);
2076     movl(dst, 0);
2077     jcc(Assembler::equal , L);
2078     increment(dst);
2079   } else { // unordered is greater
2080     movl(dst, 1);
2081     jcc(Assembler::parity, L);
2082     jcc(Assembler::above , L);
2083     movl(dst, 0);
2084     jcc(Assembler::equal , L);
2085     decrementl(dst);
2086   }
2087   bind(L);
2088 }
2089 
2090 void MacroAssembler::fld_d(AddressLiteral src) {
2091   fld_d(as_Address(src));
2092 }
2093 
2094 void MacroAssembler::fld_s(AddressLiteral src) {
2095   fld_s(as_Address(src));
2096 }
2097 
2098 void MacroAssembler::fldcw(AddressLiteral src) {
2099   Assembler::fldcw(as_Address(src));
2100 }
2101 
2102 void MacroAssembler::fpop() {
2103   ffree();
2104   fincstp();
2105 }
2106 
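     // fremr computes a floating-point remainder on the x87 stack using fprem
     // (the truncating, fmod-style remainder). fprem only does a partial
     // reduction and sets C2 in the status word while more iterations remain;
     // fnstsw_ax + sahf maps C2 onto PF, so the loop spins until parity clears.
     // The trailing fxch/fpop discards ST1 so the FPU stack cannot overflow.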
2107 void MacroAssembler::fremr(Register tmp) {
2108   save_rax(tmp);
2109   { Label L;
2110     bind(L);
2111     fprem();
2112     fwait(); fnstsw_ax();
2113     sahf();
2114     jcc(Assembler::parity, L);
2115   }
2116   restore_rax(tmp);
2117   // Result is in ST0.
2118   // Note: fxch & fpop to get rid of ST1
2119   // (otherwise FPU stack could overflow eventually)
2120   fxch(1);
2121   fpop();
2122 }
2123 
2124 void MacroAssembler::empty_FPU_stack() {
2125   if (VM_Version::supports_mmx()) {
2126     emms();
2127   } else {
2128     for (int i = 8; i-- > 0; ) ffree(i);
2129   }
2130 }
2131 #endif // !LP64
2132 
2133 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2134   if (reachable(src)) {
2135     Assembler::mulpd(dst, as_Address(src));
2136   } else {
2137     lea(rscratch1, src);
2138     Assembler::mulpd(dst, Address(rscratch1, 0));
2139   }
2140 }
2141 
2142 void MacroAssembler::load_float(Address src) {
2143 #ifdef _LP64
2144   movflt(xmm0, src);
2145 #else
2146   if (UseSSE >= 1) {
2147     movflt(xmm0, src);
2148   } else {
2149     fld_s(src);
2150   }
2151 #endif // LP64
2152 }
2153 
2154 void MacroAssembler::store_float(Address dst) {
2155 #ifdef _LP64
2156   movflt(dst, xmm0);
2157 #else
2158   if (UseSSE >= 1) {
2159     movflt(dst, xmm0);
2160   } else {
2161     fstp_s(dst);
2162   }
2163 #endif // LP64
2164 }
2165 
2166 void MacroAssembler::load_double(Address src) {
2167 #ifdef _LP64
2168   movdbl(xmm0, src);
2169 #else
2170   if (UseSSE >= 2) {
2171     movdbl(xmm0, src);
2172   } else {
2173     fld_d(src);
2174   }
2175 #endif // LP64
2176 }
2177 
2178 void MacroAssembler::store_double(Address dst) {
2179 #ifdef _LP64
2180   movdbl(dst, xmm0);
2181 #else
2182   if (UseSSE >= 2) {
2183     movdbl(dst, xmm0);
2184   } else {
2185     fstp_d(dst);
2186   }
2187 #endif // LP64
2188 }
2189 
2190 // dst = c = a * b + c
2191 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2192   Assembler::vfmadd231sd(c, a, b);
2193   if (dst != c) {
2194     movdbl(dst, c);
2195   }
2196 }
2197 
2198 // dst = c = a * b + c
2199 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2200   Assembler::vfmadd231ss(c, a, b);
2201   if (dst != c) {
2202     movflt(dst, c);
2203   }
2204 }
2205 
2206 // dst = c = a * b + c
2207 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2208   Assembler::vfmadd231pd(c, a, b, vector_len);
2209   if (dst != c) {
2210     vmovdqu(dst, c);
2211   }
2212 }
2213 
2214 // dst = c = a * b + c
2215 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2216   Assembler::vfmadd231ps(c, a, b, vector_len);
2217   if (dst != c) {
2218     vmovdqu(dst, c);
2219   }
2220 }
2221 
2222 // dst = c = a * b + c
2223 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2224   Assembler::vfmadd231pd(c, a, b, vector_len);
2225   if (dst != c) {
2226     vmovdqu(dst, c);
2227   }
2228 }
2229 
2230 // dst = c = a * b + c
2231 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2232   Assembler::vfmadd231ps(c, a, b, vector_len);
2233   if (dst != c) {
2234     vmovdqu(dst, c);
2235   }
2236 }
2237 
2238 void MacroAssembler::incrementl(AddressLiteral dst) {
2239   if (reachable(dst)) {
2240     incrementl(as_Address(dst));
2241   } else {
2242     lea(rscratch1, dst);
2243     incrementl(Address(rscratch1, 0));
2244   }
2245 }
2246 
2247 void MacroAssembler::incrementl(ArrayAddress dst) {
2248   incrementl(as_Address(dst));
2249 }
2250 
2251 void MacroAssembler::incrementl(Register reg, int value) {
2252   if (value == min_jint) {addl(reg, value) ; return; }
2253   if (value <  0) { decrementl(reg, -value); return; }
2254   if (value == 0) {                        ; return; }
2255   if (value == 1 && UseIncDec) { incl(reg) ; return; }
2256   /* else */      { addl(reg, value)       ; return; }
2257 }
2258 
2259 void MacroAssembler::incrementl(Address dst, int value) {
2260   if (value == min_jint) {addl(dst, value) ; return; }
2261   if (value <  0) { decrementl(dst, -value); return; }
2262   if (value == 0) {                        ; return; }
2263   if (value == 1 && UseIncDec) { incl(dst) ; return; }
2264   /* else */      { addl(dst, value)       ; return; }
2265 }
2266 
2267 void MacroAssembler::jump(AddressLiteral dst) {
2268   if (reachable(dst)) {
2269     jmp_literal(dst.target(), dst.rspec());
2270   } else {
2271     lea(rscratch1, dst);
2272     jmp(rscratch1);
2273   }
2274 }
2275 
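     // jump_cc: when the target is reachable, a direct jcc rel8/rel32 is
     // emitted. Otherwise the condition is reversed and a short branch skips
     // over an indirect jump through rscratch1, which can reach any address.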
2276 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
2277   if (reachable(dst)) {
2278     InstructionMark im(this);
2279     relocate(dst.reloc());
2280     const int short_size = 2;
2281     const int long_size = 6;
2282     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
2283     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
2284       // 0111 tttn #8-bit disp
2285       emit_int8(0x70 | cc);
2286       emit_int8((offs - short_size) & 0xFF);
2287     } else {
2288       // 0000 1111 1000 tttn #32-bit disp
2289       emit_int8(0x0F);
2290       emit_int8((unsigned char)(0x80 | cc));
2291       emit_int32(offs - long_size);
2292     }
2293   } else {
2294 #ifdef ASSERT
2295     warning("reversing conditional branch");
2296 #endif /* ASSERT */
2297     Label skip;
2298     jccb(reverse[cc], skip);
2299     lea(rscratch1, dst);
2300     Assembler::jmp(rscratch1);
2301     bind(skip);
2302   }
2303 }
2304 
2305 void MacroAssembler::fld_x(AddressLiteral src) {
2306   Assembler::fld_x(as_Address(src));
2307 }
2308 
2309 void MacroAssembler::ldmxcsr(AddressLiteral src, Register scratchReg) {
2310   if (reachable(src)) {
2311     Assembler::ldmxcsr(as_Address(src));
2312   } else {
2313     lea(scratchReg, src);
2314     Assembler::ldmxcsr(Address(scratchReg, 0));
2315   }
2316 }
2317 
2318 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2319   int off;
2320   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2321     off = offset();
2322     movsbl(dst, src); // movsxb
2323   } else {
2324     off = load_unsigned_byte(dst, src);
2325     shll(dst, 24);
2326     sarl(dst, 24);
2327   }
2328   return off;
2329 }
2330 
2331 // Note: load_signed_short used to be called load_signed_word.
2332 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
2333 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
2334 // The term "word" in HotSpot means a 32- or 64-bit machine word.
2335 int MacroAssembler::load_signed_short(Register dst, Address src) {
2336   int off;
2337   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2338     // This is dubious to me since it seems safe to do a signed 16 => 64 bit
2339     // version, but this is what 64bit has always done, which seems to imply
2340     // that callers only rely on the low 32 bits.
2341     off = offset();
2342     movswl(dst, src); // movsxw
2343   } else {
2344     off = load_unsigned_short(dst, src);
2345     shll(dst, 16);
2346     sarl(dst, 16);
2347   }
2348   return off;
2349 }
2350 
2351 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2352   // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
2353   // and "3.9 Partial Register Penalties", p. 22.
2354   int off;
2355   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
2356     off = offset();
2357     movzbl(dst, src); // movzxb
2358   } else {
2359     xorl(dst, dst);
2360     off = offset();
2361     movb(dst, src);
2362   }
2363   return off;
2364 }
2365 
2366 // Note: load_unsigned_short used to be called load_unsigned_word.
2367 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2368   // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
2369   // and "3.9 Partial Register Penalties", p. 22.
2370   int off;
2371   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
2372     off = offset();
2373     movzwl(dst, src); // movzxw
2374   } else {
2375     xorl(dst, dst);
2376     off = offset();
2377     movw(dst, src);
2378   }
2379   return off;
2380 }
2381 
2382 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
2383   switch (size_in_bytes) {
2384 #ifndef _LP64
2385   case  8:
2386     assert(dst2 != noreg, "second dest register required");
2387     movl(dst,  src);
2388     movl(dst2, src.plus_disp(BytesPerInt));
2389     break;
2390 #else
2391   case  8:  movq(dst, src); break;
2392 #endif
2393   case  4:  movl(dst, src); break;
2394   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2395   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2396   default:  ShouldNotReachHere();
2397   }
2398 }
2399 
2400 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2401   switch (size_in_bytes) {
2402 #ifndef _LP64
2403   case  8:
2404     assert(src2 != noreg, "second source register required");
2405     movl(dst,                        src);
2406     movl(dst.plus_disp(BytesPerInt), src2);
2407     break;
2408 #else
2409   case  8:  movq(dst, src); break;
2410 #endif
2411   case  4:  movl(dst, src); break;
2412   case  2:  movw(dst, src); break;
2413   case  1:  movb(dst, src); break;
2414   default:  ShouldNotReachHere();
2415   }
2416 }
2417 
2418 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
2419   if (reachable(dst)) {
2420     movl(as_Address(dst), src);
2421   } else {
2422     lea(rscratch1, dst);
2423     movl(Address(rscratch1, 0), src);
2424   }
2425 }
2426 
2427 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2428   if (reachable(src)) {
2429     movl(dst, as_Address(src));
2430   } else {
2431     lea(rscratch1, src);
2432     movl(dst, Address(rscratch1, 0));
2433   }
2434 }
2435 
2436 // C++ bool manipulation
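     // sizeof(bool) is implementation defined, so these helpers dispatch on it
     // at C++ compile time instead of assuming a single byte.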
2437 
2438 void MacroAssembler::movbool(Register dst, Address src) {
2439   if(sizeof(bool) == 1)
2440     movb(dst, src);
2441   else if(sizeof(bool) == 2)
2442     movw(dst, src);
2443   else if(sizeof(bool) == 4)
2444     movl(dst, src);
2445   else
2446     // unsupported
2447     ShouldNotReachHere();
2448 }
2449 
2450 void MacroAssembler::movbool(Address dst, bool boolconst) {
2451   if(sizeof(bool) == 1)
2452     movb(dst, (int) boolconst);
2453   else if(sizeof(bool) == 2)
2454     movw(dst, (int) boolconst);
2455   else if(sizeof(bool) == 4)
2456     movl(dst, (int) boolconst);
2457   else
2458     // unsupported
2459     ShouldNotReachHere();
2460 }
2461 
2462 void MacroAssembler::movbool(Address dst, Register src) {
2463   if(sizeof(bool) == 1)
2464     movb(dst, src);
2465   else if(sizeof(bool) == 2)
2466     movw(dst, src);
2467   else if(sizeof(bool) == 4)
2468     movl(dst, src);
2469   else
2470     // unsupported
2471     ShouldNotReachHere();
2472 }
2473 
2474 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
2475   movb(as_Address(dst), src);
2476 }
2477 
2478 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
2479   if (reachable(src)) {
2480     movdl(dst, as_Address(src));
2481   } else {
2482     lea(rscratch1, src);
2483     movdl(dst, Address(rscratch1, 0));
2484   }
2485 }
2486 
2487 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
2488   if (reachable(src)) {
2489     movq(dst, as_Address(src));
2490   } else {
2491     lea(rscratch1, src);
2492     movq(dst, Address(rscratch1, 0));
2493   }
2494 }
2495 
2496 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
2497   if (reachable(src)) {
2498     if (UseXmmLoadAndClearUpper) {
2499       movsd (dst, as_Address(src));
2500     } else {
2501       movlpd(dst, as_Address(src));
2502     }
2503   } else {
2504     lea(rscratch1, src);
2505     if (UseXmmLoadAndClearUpper) {
2506       movsd (dst, Address(rscratch1, 0));
2507     } else {
2508       movlpd(dst, Address(rscratch1, 0));
2509     }
2510   }
2511 }
2512 
2513 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
2514   if (reachable(src)) {
2515     movss(dst, as_Address(src));
2516   } else {
2517     lea(rscratch1, src);
2518     movss(dst, Address(rscratch1, 0));
2519   }
2520 }
2521 
2522 void MacroAssembler::movptr(Register dst, Register src) {
2523   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2524 }
2525 
2526 void MacroAssembler::movptr(Register dst, Address src) {
2527   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2528 }
2529 
2530 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
2531 void MacroAssembler::movptr(Register dst, intptr_t src) {
2532   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
2533 }
2534 
2535 void MacroAssembler::movptr(Address dst, Register src) {
2536   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2537 }
2538 
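     // The asserts in the mov(/v)dqu helpers below reflect an encoding limit:
     // xmm16-xmm31 are only addressable with EVEX, so touching them at these
     // vector widths requires AVX-512VL; otherwise registers must be xmm0-15.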
2539 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2540     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2541     Assembler::movdqu(dst, src);
2542 }
2543 
2544 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2545     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2546     Assembler::movdqu(dst, src);
2547 }
2548 
2549 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2550     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2551     Assembler::movdqu(dst, src);
2552 }
2553 
2554 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
2555   if (reachable(src)) {
2556     movdqu(dst, as_Address(src));
2557   } else {
2558     lea(scratchReg, src);
2559     movdqu(dst, Address(scratchReg, 0));
2560   }
2561 }
2562 
2563 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2564     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2565     Assembler::vmovdqu(dst, src);
2566 }
2567 
2568 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2569     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2570     Assembler::vmovdqu(dst, src);
2571 }
2572 
2573 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2574     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2575     Assembler::vmovdqu(dst, src);
2576 }
2577 
2578 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
2579   if (reachable(src)) {
2580     vmovdqu(dst, as_Address(src));
2581   }
2582   else {
2583     lea(scratch_reg, src);
2584     vmovdqu(dst, Address(scratch_reg, 0));
2585   }
2586 }
2587 
2588 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg, int vector_len) {
2589   if (vector_len == AVX_512bit) {
2590     evmovdquq(dst, src, AVX_512bit, scratch_reg);
2591   } else if (vector_len == AVX_256bit) {
2592     vmovdqu(dst, src, scratch_reg);
2593   } else {
2594     movdqu(dst, src, scratch_reg);
2595   }
2596 }
2597 
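     // kmov helpers: AVX512BW adds the 64-bit kmovq form, which is needed when
     // an opmask covers the byte/word lanes of a 512-bit vector; without it
     // only the 16-bit kmovw form exists, which suffices for dword/qword masks.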
2598 void MacroAssembler::kmov(KRegister dst, Address src) {
2599   if (VM_Version::supports_avx512bw()) {
2600     kmovql(dst, src);
2601   } else {
2602     assert(VM_Version::supports_evex(), "");
2603     kmovwl(dst, src);
2604   }
2605 }
2606 
2607 void MacroAssembler::kmov(Address dst, KRegister src) {
2608   if (VM_Version::supports_avx512bw()) {
2609     kmovql(dst, src);
2610   } else {
2611     assert(VM_Version::supports_evex(), "");
2612     kmovwl(dst, src);
2613   }
2614 }
2615 
2616 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2617   if (VM_Version::supports_avx512bw()) {
2618     kmovql(dst, src);
2619   } else {
2620     assert(VM_Version::supports_evex(), "");
2621     kmovwl(dst, src);
2622   }
2623 }
2624 
2625 void MacroAssembler::kmov(Register dst, KRegister src) {
2626   if (VM_Version::supports_avx512bw()) {
2627     kmovql(dst, src);
2628   } else {
2629     assert(VM_Version::supports_evex(), "");
2630     kmovwl(dst, src);
2631   }
2632 }
2633 
2634 void MacroAssembler::kmov(KRegister dst, Register src) {
2635   if (VM_Version::supports_avx512bw()) {
2636     kmovql(dst, src);
2637   } else {
2638     assert(VM_Version::supports_evex(), "");
2639     kmovwl(dst, src);
2640   }
2641 }
2642 
2643 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register scratch_reg) {
2644   if (reachable(src)) {
2645     kmovql(dst, as_Address(src));
2646   } else {
2647     lea(scratch_reg, src);
2648     kmovql(dst, Address(scratch_reg, 0));
2649   }
2650 }
2651 
2652 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) {
2653   if (reachable(src)) {
2654     kmovwl(dst, as_Address(src));
2655   } else {
2656     lea(scratch_reg, src);
2657     kmovwl(dst, Address(scratch_reg, 0));
2658   }
2659 }
2660 
2661 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2662                                int vector_len, Register scratch_reg) {
2663   if (reachable(src)) {
2664     Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2665   } else {
2666     lea(scratch_reg, src);
2667     Assembler::evmovdqub(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2668   }
2669 }
2670 
2671 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2672                                int vector_len, Register scratch_reg) {
2673   if (reachable(src)) {
2674     Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2675   } else {
2676     lea(scratch_reg, src);
2677     Assembler::evmovdquw(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2678   }
2679 }
2680 
2681 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2682                                int vector_len, Register scratch_reg) {
2683   if (reachable(src)) {
2684     Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2685   } else {
2686     lea(scratch_reg, src);
2687     Assembler::evmovdqul(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2688   }
2689 }
2690 
2691 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2692                                int vector_len, Register scratch_reg) {
2693   if (reachable(src)) {
2694     Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2695   } else {
2696     lea(scratch_reg, src);
2697     Assembler::evmovdquq(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2698   }
2699 }
2700 
2701 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2702   if (reachable(src)) {
2703     Assembler::evmovdquq(dst, as_Address(src), vector_len);
2704   } else {
2705     lea(rscratch, src);
2706     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2707   }
2708 }
2709 
2710 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
2711   if (reachable(src)) {
2712     Assembler::movdqa(dst, as_Address(src));
2713   } else {
2714     lea(rscratch1, src);
2715     Assembler::movdqa(dst, Address(rscratch1, 0));
2716   }
2717 }
2718 
2719 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
2720   if (reachable(src)) {
2721     Assembler::movsd(dst, as_Address(src));
2722   } else {
2723     lea(rscratch1, src);
2724     Assembler::movsd(dst, Address(rscratch1, 0));
2725   }
2726 }
2727 
2728 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
2729   if (reachable(src)) {
2730     Assembler::movss(dst, as_Address(src));
2731   } else {
2732     lea(rscratch1, src);
2733     Assembler::movss(dst, Address(rscratch1, 0));
2734   }
2735 }
2736 
2737 void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2738   if (reachable(src)) {
2739     Assembler::vmovddup(dst, as_Address(src), vector_len);
2740   } else {
2741     lea(rscratch, src);
2742     Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
2743   }
2744 }
2745 
2746 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
2747   if (reachable(src)) {
2748     Assembler::mulsd(dst, as_Address(src));
2749   } else {
2750     lea(rscratch1, src);
2751     Assembler::mulsd(dst, Address(rscratch1, 0));
2752   }
2753 }
2754 
2755 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
2756   if (reachable(src)) {
2757     Assembler::mulss(dst, as_Address(src));
2758   } else {
2759     lea(rscratch1, src);
2760     Assembler::mulss(dst, Address(rscratch1, 0));
2761   }
2762 }
2763 
2764 void MacroAssembler::null_check(Register reg, int offset) {
2765   if (needs_explicit_null_check(offset)) {
2766     // provoke OS NULL exception if reg = NULL by
2767     // accessing M[reg] w/o changing any (non-CC) registers
2768     // NOTE: cmpl is plenty here to provoke a segv
2769     cmpptr(rax, Address(reg, 0));
2770     // Note: should probably use testl(rax, Address(reg, 0));
2771     //       may be shorter code (however, this version of
2772     //       testl needs to be implemented first)
2773   } else {
2774     // nothing to do, (later) access of M[reg + offset]
2775     // will provoke OS NULL exception if reg = NULL
2776   }
2777 }
2778 
2779 void MacroAssembler::os_breakpoint() {
2780   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
2781   // (e.g., MSVC can't call ps() otherwise)
2782   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2783 }
2784 
2785 void MacroAssembler::unimplemented(const char* what) {
2786   const char* buf = NULL;
2787   {
2788     ResourceMark rm;
2789     stringStream ss;
2790     ss.print("unimplemented: %s", what);
2791     buf = code_string(ss.as_string());
2792   }
2793   stop(buf);
2794 }
2795 
2796 #ifdef _LP64
2797 #define XSTATE_BV 0x200
2798 #endif
2799 
2800 void MacroAssembler::pop_CPU_state() {
2801   pop_FPU_state();
2802   pop_IU_state();
2803 }
2804 
2805 void MacroAssembler::pop_FPU_state() {
2806 #ifndef _LP64
2807   frstor(Address(rsp, 0));
2808 #else
2809   fxrstor(Address(rsp, 0));
2810 #endif
2811   addptr(rsp, FPUStateSizeInWords * wordSize);
2812 }
2813 
2814 void MacroAssembler::pop_IU_state() {
2815   popa();
2816   LP64_ONLY(addq(rsp, 8));
2817   popf();
2818 }
2819 
2820 // Save Integer and Float state
2821 // Warning: Stack must be 16 byte aligned (64bit)
2822 void MacroAssembler::push_CPU_state() {
2823   push_IU_state();
2824   push_FPU_state();
2825 }
2826 
2827 void MacroAssembler::push_FPU_state() {
2828   subptr(rsp, FPUStateSizeInWords * wordSize);
2829 #ifndef _LP64
2830   fnsave(Address(rsp, 0));
2831   fwait();
2832 #else
2833   fxsave(Address(rsp, 0));
2834 #endif // LP64
2835 }
2836 
2837 void MacroAssembler::push_IU_state() {
2838   // Push flags first because pusha kills them
2839   pushf();
2840   // Make sure rsp stays 16-byte aligned
2841   LP64_ONLY(subq(rsp, 8));
2842   pusha();
2843 }
2844 
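     // push_cont_fastpath/pop_cont_fastpath maintain JavaThread::cont_fastpath,
     // a stack-address watermark consulted by the continuation freeze code. As
     // the code below shows, push records the current sp if it is above the
     // stored value, and pop clears the field once the stack has unwound back
     // to (or past) it; the full protocol is described with the continuation
     // runtime, not here.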
2845 void MacroAssembler::push_cont_fastpath() {
2846   if (!Continuations::enabled()) return;
2847 
2848 #ifndef _LP64
2849   Register rthread = rax;
2850   Register rrealsp = rbx;
2851   push(rthread);
2852   push(rrealsp);
2853 
2854   get_thread(rthread);
2855 
2856   // The code below wants the original RSP.
2857   // Move it back after the pushes above.
2858   movptr(rrealsp, rsp);
2859   addptr(rrealsp, 2*wordSize);
2860 #else
2861   Register rthread = r15_thread;
2862   Register rrealsp = rsp;
2863 #endif
2864 
2865   Label done;
2866   cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset()));
2867   jccb(Assembler::belowEqual, done);
2868   movptr(Address(rthread, JavaThread::cont_fastpath_offset()), rrealsp);
2869   bind(done);
2870 
2871 #ifndef _LP64
2872   pop(rrealsp);
2873   pop(rthread);
2874 #endif
2875 }
2876 
2877 void MacroAssembler::pop_cont_fastpath() {
2878   if (!Continuations::enabled()) return;
2879 
2880 #ifndef _LP64
2881   Register rthread = rax;
2882   Register rrealsp = rbx;
2883   push(rthread);
2884   push(rrealsp);
2885 
2886   get_thread(rthread);
2887 
2888   // The code below wants the original RSP.
2889   // Move it back after the pushes above.
2890   movptr(rrealsp, rsp);
2891   addptr(rrealsp, 2*wordSize);
2892 #else
2893   Register rthread = r15_thread;
2894   Register rrealsp = rsp;
2895 #endif
2896 
2897   Label done;
2898   cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset()));
2899   jccb(Assembler::below, done);
2900   movptr(Address(rthread, JavaThread::cont_fastpath_offset()), 0);
2901   bind(done);
2902 
2903 #ifndef _LP64
2904   pop(rrealsp);
2905   pop(rthread);
2906 #endif
2907 }
2908 
2909 void MacroAssembler::inc_held_monitor_count() {
2910 #ifndef _LP64
2911   Register thread = rax;
2912   push(thread);
2913   get_thread(thread);
2914   incrementl(Address(thread, JavaThread::held_monitor_count_offset()));
2915   pop(thread);
2916 #else // LP64
2917   incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
2918 #endif
2919 }
2920 
2921 void MacroAssembler::dec_held_monitor_count() {
2922 #ifndef _LP64
2923   Register thread = rax;
2924   push(thread);
2925   get_thread(thread);
2926   decrementl(Address(thread, JavaThread::held_monitor_count_offset()));
2927   pop(thread);
2928 #else // LP64
2929   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
2930 #endif
2931 }
2932 
2933 #ifdef ASSERT
2934 void MacroAssembler::stop_if_in_cont(Register cont, const char* name) {
2935 #ifdef _LP64
2936   Label no_cont;
2937   movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset()));
2938   testptr(cont, cont);
2939   jcc(Assembler::zero, no_cont);
2940   stop(name);
2941   bind(no_cont);
2942 #else
2943   Unimplemented();
2944 #endif
2945 }
2946 #endif
2947 
2948 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) {
2949   if (!java_thread->is_valid()) { // determine java_thread register
2950     java_thread = rdi;
2951     get_thread(java_thread);
2952   }
2953   // we must set sp to zero to clear frame
2954   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
2955   // must clear fp, so that compiled frames are not confused; it is
2956   // possible that we need it only for debugging
2957   if (clear_fp) {
2958     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2959   }
2960   // Always clear the pc because it could have been set by make_walkable()
2961   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
2962   vzeroupper();
2963 }
2964 
2965 void MacroAssembler::restore_rax(Register tmp) {
2966   if (tmp == noreg) pop(rax);
2967   else if (tmp != rax) mov(rax, tmp);
2968 }
2969 
2970 void MacroAssembler::round_to(Register reg, int modulus) {
2971   addptr(reg, modulus - 1);
2972   andptr(reg, -modulus);
2973 }
2974 
2975 void MacroAssembler::save_rax(Register tmp) {
2976   if (tmp == noreg) push(rax);
2977   else if (tmp != rax) mov(tmp, rax);
2978 }
2979 
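     // safepoint_poll: on the return path the poll compares a frame address
     // (rsp in an nmethod, rbp otherwise) against the polling word, which then
     // acts as a stack watermark; off the return path only the poll/handshake
     // bit in the polling word is tested.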
2980 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
2981   if (at_return) {
2982     // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
2983     // we may safely use rsp instead to perform the stack watermark check.
2984     cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset()));
2985     jcc(Assembler::above, slow_path);
2986     return;
2987   }
2988   testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2989   jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
2990 }
2991 
2992 // Calls to C land
2993 //
2994 // When entering C land, the rbp and rsp of the last Java frame have to be recorded
2995 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
2996 // has to be reset to 0. This is required to allow proper stack traversal.
2997 void MacroAssembler::set_last_Java_frame(Register java_thread,
2998                                          Register last_java_sp,
2999                                          Register last_java_fp,
3000                                          address  last_java_pc) {
3001   vzeroupper();
3002   // determine java_thread register
3003   if (!java_thread->is_valid()) {
3004     java_thread = rdi;
3005     get_thread(java_thread);
3006   }
3007   // determine last_java_sp register
3008   if (!last_java_sp->is_valid()) {
3009     last_java_sp = rsp;
3010   }
3011 
3012   // last_java_fp is optional
3013 
3014   if (last_java_fp->is_valid()) {
3015     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3016   }
3017 
3018   // last_java_pc is optional
3019 
3020   if (last_java_pc != NULL) {
3021     lea(Address(java_thread,
3022                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3023         InternalAddress(last_java_pc));
3024 
3025   }
3026   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3027 }
3028 
3029 void MacroAssembler::shlptr(Register dst, int imm8) {
3030   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3031 }
3032 
3033 void MacroAssembler::shrptr(Register dst, int imm8) {
3034   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3035 }
3036 
3037 void MacroAssembler::sign_extend_byte(Register reg) {
3038   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3039     movsbl(reg, reg); // movsxb
3040   } else {
3041     shll(reg, 24);
3042     sarl(reg, 24);
3043   }
3044 }
3045 
3046 void MacroAssembler::sign_extend_short(Register reg) {
3047   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3048     movswl(reg, reg); // movsxw
3049   } else {
3050     shll(reg, 16);
3051     sarl(reg, 16);
3052   }
3053 }
3054 
3055 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3056   assert(reachable(src), "Address should be reachable");
3057   testl(dst, as_Address(src));
3058 }
3059 
3060 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3061   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3062   Assembler::pcmpeqb(dst, src);
3063 }
3064 
3065 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3066   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3067   Assembler::pcmpeqw(dst, src);
3068 }
3069 
3070 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3071   assert((dst->encoding() < 16),"XMM register should be 0-15");
3072   Assembler::pcmpestri(dst, src, imm8);
3073 }
3074 
3075 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3076   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3077   Assembler::pcmpestri(dst, src, imm8);
3078 }
3079 
3080 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3081   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3082   Assembler::pmovzxbw(dst, src);
3083 }
3084 
3085 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3086   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3087   Assembler::pmovzxbw(dst, src);
3088 }
3089 
3090 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3091   assert((src->encoding() < 16),"XMM register should be 0-15");
3092   Assembler::pmovmskb(dst, src);
3093 }
3094 
3095 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3096   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3097   Assembler::ptest(dst, src);
3098 }
3099 
3100 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3101   if (reachable(src)) {
3102     Assembler::sqrtsd(dst, as_Address(src));
3103   } else {
3104     lea(rscratch1, src);
3105     Assembler::sqrtsd(dst, Address(rscratch1, 0));
3106   }
3107 }
3108 
3109 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3110   if (reachable(src)) {
3111     Assembler::sqrtss(dst, as_Address(src));
3112   } else {
3113     lea(rscratch1, src);
3114     Assembler::sqrtss(dst, Address(rscratch1, 0));
3115   }
3116 }
3117 
3118 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3119   if (reachable(src)) {
3120     Assembler::subsd(dst, as_Address(src));
3121   } else {
3122     lea(rscratch1, src);
3123     Assembler::subsd(dst, Address(rscratch1, 0));
3124   }
3125 }
3126 
3127 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) {
3128   if (reachable(src)) {
3129     Assembler::roundsd(dst, as_Address(src), rmode);
3130   } else {
3131     lea(scratch_reg, src);
3132     Assembler::roundsd(dst, Address(scratch_reg, 0), rmode);
3133   }
3134 }
3135 
3136 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3137   if (reachable(src)) {
3138     Assembler::subss(dst, as_Address(src));
3139   } else {
3140     lea(rscratch1, src);
3141     Assembler::subss(dst, Address(rscratch1, 0));
3142   }
3143 }
3144 
3145 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3146   if (reachable(src)) {
3147     Assembler::ucomisd(dst, as_Address(src));
3148   } else {
3149     lea(rscratch1, src);
3150     Assembler::ucomisd(dst, Address(rscratch1, 0));
3151   }
3152 }
3153 
3154 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3155   if (reachable(src)) {
3156     Assembler::ucomiss(dst, as_Address(src));
3157   } else {
3158     lea(rscratch1, src);
3159     Assembler::ucomiss(dst, Address(rscratch1, 0));
3160   }
3161 }
3162 
3163 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3164   // Used in sign-bit flipping with aligned address.
3165   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3166   if (reachable(src)) {
3167     Assembler::xorpd(dst, as_Address(src));
3168   } else {
3169     lea(scratch_reg, src);
3170     Assembler::xorpd(dst, Address(scratch_reg, 0));
3171   }
3172 }
3173 
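     // xorpd/xorps reg-reg: on AVX-512 hardware that lacks the DQ extension the
     // EVEX encodings of the FP logical xors are unavailable (DQ is what adds
     // them), so the common self-xor (zeroing) case falls back to a 512-bit
     // vpxor, which AVX-512F does provide; otherwise the ordinary xorpd/xorps
     // below is used.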
3174 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3175   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3176     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3177   }
3178   else {
3179     Assembler::xorpd(dst, src);
3180   }
3181 }
3182 
3183 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3184   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3185     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3186   } else {
3187     Assembler::xorps(dst, src);
3188   }
3189 }
3190 
3191 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3192   // Used in sign-bit flipping with aligned address.
3193   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3194   if (reachable(src)) {
3195     Assembler::xorps(dst, as_Address(src));
3196   } else {
3197     lea(scratch_reg, src);
3198     Assembler::xorps(dst, Address(scratch_reg, 0));
3199   }
3200 }
3201 
3202 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3203   // Used in sign-bit flipping with aligned address.
3204   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3205   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3206   if (reachable(src)) {
3207     Assembler::pshufb(dst, as_Address(src));
3208   } else {
3209     lea(rscratch1, src);
3210     Assembler::pshufb(dst, Address(rscratch1, 0));
3211   }
3212 }
3213 
3214 // AVX 3-operands instructions
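//
// These wrappers take an AddressLiteral constant operand.  When the literal
// is reachable (RIP-relative on x86_64) it is used directly; otherwise its
// address is first materialized into a scratch register with lea() and the
// memory form of the instruction is emitted -- the same pattern as the
// scalar wrappers above.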
3215 
3216 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3217   if (reachable(src)) {
3218     vaddsd(dst, nds, as_Address(src));
3219   } else {
3220     lea(rscratch1, src);
3221     vaddsd(dst, nds, Address(rscratch1, 0));
3222   }
3223 }
3224 
3225 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3226   if (reachable(src)) {
3227     vaddss(dst, nds, as_Address(src));
3228   } else {
3229     lea(rscratch1, src);
3230     vaddss(dst, nds, Address(rscratch1, 0));
3231   }
3232 }
3233 
3234 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3235   assert(UseAVX > 0, "requires some form of AVX");
3236   if (reachable(src)) {
3237     Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
3238   } else {
3239     lea(rscratch, src);
3240     Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
3241   }
3242 }
3243 
3244 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3245   assert(UseAVX > 0, "requires some form of AVX");
3246   if (reachable(src)) {
3247     Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3248   } else {
3249     lea(rscratch, src);
3250     Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3251   }
3252 }
3253 
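// Absolute value of a scalar float/double: 'negate_field' is a caller-supplied
// AND mask (for abs it is expected to be all ones except the sign bit) that is
// applied to nds; 'src' only participates in the register-encoding assert.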
3254 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3255   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3256   vandps(dst, nds, negate_field, vector_len);
3257 }
3258 
3259 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3260   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3261   vandpd(dst, nds, negate_field, vector_len);
3262 }
3263 
3264 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3265   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3266   Assembler::vpaddb(dst, nds, src, vector_len);
3267 }
3268 
3269 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3270   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3271   Assembler::vpaddb(dst, nds, src, vector_len);
3272 }
3273 
3274 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3275   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3276   Assembler::vpaddw(dst, nds, src, vector_len);
3277 }
3278 
3279 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3280   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3281   Assembler::vpaddw(dst, nds, src, vector_len);
3282 }
3283 
3284 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3285   if (reachable(src)) {
3286     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3287   } else {
3288     lea(scratch_reg, src);
3289     Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3290   }
3291 }
3292 
3293 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3294   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3295   Assembler::vpbroadcastw(dst, src, vector_len);
3296 }
3297 
3298 void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3299   if (reachable(src)) {
3300     Assembler::vpbroadcastq(dst, as_Address(src), vector_len);
3301   } else {
3302     lea(rscratch, src);
3303     Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len);
3304   }
3305 }
3306 
3307 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3308   if (reachable(src)) {
3309     Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
3310   } else {
3311     lea(rscratch, src);
3312     Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
3313   }
3314 }
3315 
3316 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3317   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3318   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3319 }
3320 
3321 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3322   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3323   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3324 }
3325 
3326 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds,
3327                                AddressLiteral src, int vector_len, Register scratch_reg) {
3328   if (reachable(src)) {
3329     Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3330   } else {
3331     lea(scratch_reg, src);
3332     Assembler::evpcmpeqd(kdst, mask, nds, Address(scratch_reg, 0), vector_len);
3333   }
3334 }
3335 
3336 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3337                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3338   if (reachable(src)) {
3339     Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3340   } else {
3341     lea(scratch_reg, src);
3342     Assembler::evpcmpd(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3343   }
3344 }
3345 
3346 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3347                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3348   if (reachable(src)) {
3349     Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3350   } else {
3351     lea(scratch_reg, src);
3352     Assembler::evpcmpq(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3353   }
3354 }
3355 
3356 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3357                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3358   if (reachable(src)) {
3359     Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3360   } else {
3361     lea(scratch_reg, src);
3362     Assembler::evpcmpb(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3363   }
3364 }
3365 
3366 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3367                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3368   if (reachable(src)) {
3369     Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3370   } else {
3371     lea(scratch_reg, src);
3372     Assembler::evpcmpw(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3373   }
3374 }
3375 
3376 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3377   if (width == Assembler::Q) {
3378     Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3379   } else {
3380     Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3381   }
3382 }
3383 
3384 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
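  // 0x29/0x37 are the pcmpeqq/pcmpgtq opcode bytes used for quadword compares;
  // 0x74 + width / 0x64 + width select pcmpeq{b,w,d} / pcmpgt{b,w,d}.
  // Predicates without a direct instruction (neq, le, nlt) are synthesized
  // below by inverting the corresponding compare with an all-ones XOR.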
3385   int eq_cond_enc = 0x29;
3386   int gt_cond_enc = 0x37;
3387   if (width != Assembler::Q) {
3388     eq_cond_enc = 0x74 + width;
3389     gt_cond_enc = 0x64 + width;
3390   }
3391   switch (cond) {
3392   case eq:
3393     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3394     break;
3395   case neq:
3396     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3397     vallones(xtmp, vector_len);
3398     vpxor(dst, xtmp, dst, vector_len);
3399     break;
3400   case le:
3401     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3402     vallones(xtmp, vector_len);
3403     vpxor(dst, xtmp, dst, vector_len);
3404     break;
3405   case nlt:
3406     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3407     vallones(xtmp, vector_len);
3408     vpxor(dst, xtmp, dst, vector_len);
3409     break;
3410   case lt:
3411     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3412     break;
3413   case nle:
3414     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3415     break;
3416   default:
3417     assert(false, "Should not reach here");
3418   }
3419 }
3420 
3421 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3422   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3423   Assembler::vpmovzxbw(dst, src, vector_len);
3424 }
3425 
3426 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3427   assert((src->encoding() < 16),"XMM register should be 0-15");
3428   Assembler::vpmovmskb(dst, src, vector_len);
3429 }
3430 
3431 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3432   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3433   Assembler::vpmullw(dst, nds, src, vector_len);
3434 }
3435 
3436 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3437   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3438   Assembler::vpmullw(dst, nds, src, vector_len);
3439 }
3440 
3441 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3442   assert((UseAVX > 0), "AVX support is needed");
3443   if (reachable(src)) {
3444     Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3445   } else {
3446     lea(scratch_reg, src);
3447     Assembler::vpmulld(dst, nds, Address(scratch_reg, 0), vector_len);
3448   }
3449 }
3450 
3451 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3452   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3453   Assembler::vpsubb(dst, nds, src, vector_len);
3454 }
3455 
3456 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3457   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3458   Assembler::vpsubb(dst, nds, src, vector_len);
3459 }
3460 
3461 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3462   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3463   Assembler::vpsubw(dst, nds, src, vector_len);
3464 }
3465 
3466 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3467   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3468   Assembler::vpsubw(dst, nds, src, vector_len);
3469 }
3470 
3471 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3472   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3473   Assembler::vpsraw(dst, nds, shift, vector_len);
3474 }
3475 
3476 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3477   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3478   Assembler::vpsraw(dst, nds, shift, vector_len);
3479 }
3480 
3481 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
  assert(UseAVX > 2, "requires AVX-512");
  if (!VM_Version::supports_avx512vl() && vector_len < 2) {
    vector_len = 2;
  }
3486   Assembler::evpsraq(dst, nds, shift, vector_len);
3487 }
3488 
3489 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  assert(UseAVX > 2, "requires AVX-512");
  if (!VM_Version::supports_avx512vl() && vector_len < 2) {
    vector_len = 2;
  }
3494   Assembler::evpsraq(dst, nds, shift, vector_len);
3495 }
3496 
3497 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3498   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3499   Assembler::vpsrlw(dst, nds, shift, vector_len);
3500 }
3501 
3502 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3503   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3504   Assembler::vpsrlw(dst, nds, shift, vector_len);
3505 }
3506 
3507 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3508   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3509   Assembler::vpsllw(dst, nds, shift, vector_len);
3510 }
3511 
3512 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3513   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3514   Assembler::vpsllw(dst, nds, shift, vector_len);
3515 }
3516 
3517 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3518   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3519   Assembler::vptest(dst, src);
3520 }
3521 
3522 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3523   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3524   Assembler::punpcklbw(dst, src);
3525 }
3526 
3527 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3528   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3529   Assembler::pshufd(dst, src, mode);
3530 }
3531 
3532 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3533   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3534   Assembler::pshuflw(dst, src, mode);
3535 }
3536 
3537 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3538   if (reachable(src)) {
3539     vandpd(dst, nds, as_Address(src), vector_len);
3540   } else {
3541     lea(scratch_reg, src);
3542     vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3543   }
3544 }
3545 
3546 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3547   if (reachable(src)) {
3548     vandps(dst, nds, as_Address(src), vector_len);
3549   } else {
3550     lea(scratch_reg, src);
3551     vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3552   }
3553 }
3554 
3555 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3556                             bool merge, int vector_len, Register scratch_reg) {
3557   if (reachable(src)) {
3558     Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3559   } else {
3560     lea(scratch_reg, src);
3561     Assembler::evpord(dst, mask, nds, Address(scratch_reg, 0), merge, vector_len);
3562   }
3563 }
3564 
3565 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3566   if (reachable(src)) {
3567     vdivsd(dst, nds, as_Address(src));
3568   } else {
3569     lea(rscratch1, src);
3570     vdivsd(dst, nds, Address(rscratch1, 0));
3571   }
3572 }
3573 
3574 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3575   if (reachable(src)) {
3576     vdivss(dst, nds, as_Address(src));
3577   } else {
3578     lea(rscratch1, src);
3579     vdivss(dst, nds, Address(rscratch1, 0));
3580   }
3581 }
3582 
3583 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3584   if (reachable(src)) {
3585     vmulsd(dst, nds, as_Address(src));
3586   } else {
3587     lea(rscratch1, src);
3588     vmulsd(dst, nds, Address(rscratch1, 0));
3589   }
3590 }
3591 
3592 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3593   if (reachable(src)) {
3594     vmulss(dst, nds, as_Address(src));
3595   } else {
3596     lea(rscratch1, src);
3597     vmulss(dst, nds, Address(rscratch1, 0));
3598   }
3599 }
3600 
3601 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3602   if (reachable(src)) {
3603     vsubsd(dst, nds, as_Address(src));
3604   } else {
3605     lea(rscratch1, src);
3606     vsubsd(dst, nds, Address(rscratch1, 0));
3607   }
3608 }
3609 
3610 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3611   if (reachable(src)) {
3612     vsubss(dst, nds, as_Address(src));
3613   } else {
3614     lea(rscratch1, src);
3615     vsubss(dst, nds, Address(rscratch1, 0));
3616   }
3617 }
3618 
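// Negation of a scalar float/double: 'src' is a caller-supplied XOR mask,
// expected to hold just the sign bit.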
3619 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3620   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3621   vxorps(dst, nds, src, Assembler::AVX_128bit);
3622 }
3623 
3624 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3625   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3626   vxorpd(dst, nds, src, Assembler::AVX_128bit);
3627 }
3628 
3629 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3630   if (reachable(src)) {
3631     vxorpd(dst, nds, as_Address(src), vector_len);
3632   } else {
3633     lea(scratch_reg, src);
3634     vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
3635   }
3636 }
3637 
3638 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3639   if (reachable(src)) {
3640     vxorps(dst, nds, as_Address(src), vector_len);
3641   } else {
3642     lea(scratch_reg, src);
3643     vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
3644   }
3645 }
3646 
3647 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
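  // The 256-bit integer form of vpxor needs AVX2; the 128-bit form only needs
  // AVX1.  On AVX1-only hardware a wider request falls back to vxorpd, which
  // produces the same bitwise result through the FP-domain encoding.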
3648   if (UseAVX > 1 || (vector_len < 1)) {
3649     if (reachable(src)) {
3650       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3651     } else {
3652       lea(scratch_reg, src);
3653       Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
3654     }
  } else {
3657     MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
3658   }
3659 }
3660 
3661 void MacroAssembler::vpermd(XMMRegister dst,  XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3662   if (reachable(src)) {
3663     Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3664   } else {
3665     lea(scratch_reg, src);
3666     Assembler::vpermd(dst, nds, Address(scratch_reg, 0), vector_len);
3667   }
3668 }
3669 
3670 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
3671   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
3672   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
3673   // The inverted mask is sign-extended
3674   andptr(possibly_jweak, inverted_jweak_mask);
3675 }
3676 
3677 void MacroAssembler::resolve_jobject(Register value,
3678                                      Register thread,
3679                                      Register tmp) {
3680   assert_different_registers(value, thread, tmp);
3681   Label done, not_weak;
3682   testptr(value, value);
3683   jcc(Assembler::zero, done);                // Use NULL as-is.
3684   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
3685   jcc(Assembler::zero, not_weak);
3686   // Resolve jweak.
3687   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3688                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
3689   verify_oop(value);
3690   jmp(done);
3691   bind(not_weak);
3692   // Resolve (untagged) jobject.
3693   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
3694   verify_oop(value);
3695   bind(done);
3696 }
3697 
3698 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3699   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
3700 }
3701 
// Force generation of a 4-byte immediate value even if it fits into 8 bits
3703 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3704   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
3705 }
3706 
3707 void MacroAssembler::subptr(Register dst, Register src) {
3708   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
3709 }
3710 
3711 // C++ bool manipulation
3712 void MacroAssembler::testbool(Register dst) {
  if (sizeof(bool) == 1) {
    testb(dst, 0xff);
  } else if (sizeof(bool) == 2) {
    // testw implementation needed for two byte bools
    ShouldNotReachHere();
  } else if (sizeof(bool) == 4) {
    testl(dst, dst);
  } else {
    // unsupported
    ShouldNotReachHere();
  }
3723 }
3724 
3725 void MacroAssembler::testptr(Register dst, Register src) {
3726   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
3727 }
3728 
3729 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3730 void MacroAssembler::tlab_allocate(Register thread, Register obj,
3731                                    Register var_size_in_bytes,
3732                                    int con_size_in_bytes,
3733                                    Register t1,
3734                                    Register t2,
3735                                    Label& slow_case) {
3736   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3737   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3738 }
3739 
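// Caller-saved (volatile) register sets of the platform C ABI.  On 64-bit
// System V rsi/rdi are volatile; on Windows x64 they are callee-saved and
// only xmm0-xmm5 are volatile (xmm16 and above have no callee-saved status,
// so they are always included when available).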
3740 RegSet MacroAssembler::call_clobbered_gp_registers() {
3741   RegSet regs;
3742 #ifdef _LP64
3743   regs += RegSet::of(rax, rcx, rdx);
3744 #ifndef WINDOWS
3745   regs += RegSet::of(rsi, rdi);
3746 #endif
3747   regs += RegSet::range(r8, r11);
3748 #else
3749   regs += RegSet::of(rax, rcx, rdx);
3750 #endif
3751   return regs;
3752 }
3753 
3754 XMMRegSet MacroAssembler::call_clobbered_xmm_registers() {
3755   int num_xmm_registers = XMMRegisterImpl::available_xmm_registers();
3756 #if defined(WINDOWS) && defined(_LP64)
3757   XMMRegSet result = XMMRegSet::range(xmm0, xmm5);
3758   if (num_xmm_registers > 16) {
3759      result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1));
3760   }
3761   return result;
3762 #else
3763   return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1));
3764 #endif
3765 }
3766 
3767 static int FPUSaveAreaSize = align_up(108, StackAlignmentInBytes); // 108 bytes needed for FPU state by fsave/frstor
3768 
3769 #ifndef _LP64
3770 static bool use_x87_registers() { return UseSSE < 2; }
3771 #endif
3772 static bool use_xmm_registers() { return UseSSE >= 1; }
3773 
3774 // C1 only ever uses the first double/float of the XMM register.
3775 static int xmm_save_size() { return UseSSE >= 2 ? sizeof(double) : sizeof(float); }
3776 
3777 static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
3778   if (UseSSE == 1) {
3779     masm->movflt(Address(rsp, offset), reg);
3780   } else {
3781     masm->movdbl(Address(rsp, offset), reg);
3782   }
3783 }
3784 
3785 static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
3786   if (UseSSE == 1) {
3787     masm->movflt(reg, Address(rsp, offset));
3788   } else {
3789     masm->movdbl(reg, Address(rsp, offset));
3790   }
3791 }
3792 
3793 int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers, bool save_fpu,
3794                            int& gp_area_size, int& fp_area_size, int& xmm_area_size) {
3795 
3796   gp_area_size = align_up(gp_registers.size() * RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size,
3797                          StackAlignmentInBytes);
3798 #ifdef _LP64
3799   fp_area_size = 0;
3800 #else
3801   fp_area_size = (save_fpu && use_x87_registers()) ? FPUSaveAreaSize : 0;
3802 #endif
3803   xmm_area_size = (save_fpu && use_xmm_registers()) ? xmm_registers.size() * xmm_save_size() : 0;
3804 
3805   return gp_area_size + fp_area_size + xmm_area_size;
3806 }
3807 
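// Save area layout used by push/pop_call_clobbered_registers_except, with
// offsets relative to rsp after the initial subptr:
//   [0, gp_area_size)                   general purpose registers
//   [gp_area_size, +fp_area_size)       x87 FPU state (32-bit with UseSSE < 2 only)
//   [gp_area_size + fp_area_size, ...)  one float/double slot per XMM register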
3808 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) {
3809   block_comment("push_call_clobbered_registers start");
3810   // Regular registers
3811   RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude;
3812 
3813   int gp_area_size;
3814   int fp_area_size;
3815   int xmm_area_size;
3816   int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu,
3817                                                gp_area_size, fp_area_size, xmm_area_size);
3818   subptr(rsp, total_save_size);
3819 
3820   push_set(gp_registers_to_push, 0);
3821 
3822 #ifndef _LP64
3823   if (save_fpu && use_x87_registers()) {
3824     fnsave(Address(rsp, gp_area_size));
3825     fwait();
3826   }
3827 #endif
3828   if (save_fpu && use_xmm_registers()) {
3829     push_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size);
3830   }
3831 
3832   block_comment("push_call_clobbered_registers end");
3833 }
3834 
3835 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) {
3836   block_comment("pop_call_clobbered_registers start");
3837 
3838   RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude;
3839 
3840   int gp_area_size;
3841   int fp_area_size;
3842   int xmm_area_size;
3843   int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu,
3844                                                gp_area_size, fp_area_size, xmm_area_size);
3845 
3846   if (restore_fpu && use_xmm_registers()) {
3847     pop_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size);
3848   }
3849 #ifndef _LP64
3850   if (restore_fpu && use_x87_registers()) {
3851     frstor(Address(rsp, gp_area_size));
3852   }
3853 #endif
3854 
3855   pop_set(gp_registers_to_pop, 0);
3856 
3857   addptr(rsp, total_save_size);
3858 
3859   vzeroupper();
3860 
3861   block_comment("pop_call_clobbered_registers end");
3862 }
3863 
3864 void MacroAssembler::push_set(XMMRegSet set, int offset) {
3865   assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be");
3866   int spill_offset = offset;
3867 
3868   for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) {
3869     save_xmm_register(this, spill_offset, *it);
3870     spill_offset += xmm_save_size();
3871   }
3872 }
3873 
3874 void MacroAssembler::pop_set(XMMRegSet set, int offset) {
3875   int restore_size = set.size() * xmm_save_size();
3876   assert(is_aligned(restore_size, StackAlignmentInBytes), "must be");
3877 
3878   int restore_offset = offset + restore_size - xmm_save_size();
3879 
3880   for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) {
3881     restore_xmm_register(this, restore_offset, *it);
3882     restore_offset -= xmm_save_size();
3883   }
3884 }
3885 
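// Spill a set of GP registers to the stack.  With offset == -1 the required
// stack-aligned space is allocated here; otherwise the caller has already
// reserved space and 'offset' gives the position within it.  pop_set() below
// mirrors this and only releases the space it allocated itself (offset == -1).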
3886 void MacroAssembler::push_set(RegSet set, int offset) {
3887   int spill_offset;
3888   if (offset == -1) {
3889     int register_push_size = set.size() * RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size;
3890     int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
3891     subptr(rsp, aligned_size);
3892     spill_offset = 0;
3893   } else {
3894     spill_offset = offset;
3895   }
3896 
3897   for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
3898     movptr(Address(rsp, spill_offset), *it);
3899     spill_offset += RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size;
3900   }
3901 }
3902 
3903 void MacroAssembler::pop_set(RegSet set, int offset) {
3904 
3905   int gp_reg_size = RegisterImpl::max_slots_per_register * VMRegImpl::stack_slot_size;
3906   int restore_size = set.size() * gp_reg_size;
3907   int aligned_size = align_up(restore_size, StackAlignmentInBytes);
3908 
3909   int restore_offset;
3910   if (offset == -1) {
3911     restore_offset = restore_size - gp_reg_size;
3912   } else {
3913     restore_offset = offset + restore_size - gp_reg_size;
3914   }
3915   for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) {
3916     movptr(*it, Address(rsp, restore_offset));
3917     restore_offset -= gp_reg_size;
3918   }
3919 
3920   if (offset == -1) {
3921     addptr(rsp, aligned_size);
3922   }
3923 }
3924 
// Preserves the contents of address; destroys the contents of length_in_bytes and temp.
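// Strategy: on LP64 an optional leading 32-bit store first brings the offset
// to 8-byte alignment, then the loop clears 8 bytes per iteration (one word
// on LP64, two words on 32-bit, preceded by a single extra word when the
// length is an odd number of words).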
3926 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
3927   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
3928   assert((offset_in_bytes & (BytesPerInt - 1)) == 0, "offset must be a multiple of BytesPerInt");
3929   Label done;
3930 
3931   testptr(length_in_bytes, length_in_bytes);
3932   jcc(Assembler::zero, done);
3933 
3934   // Emit single 32bit store to clear leading bytes, if necessary.
3935   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
3936 #ifdef _LP64
3937   if (!is_aligned(offset_in_bytes, BytesPerWord)) {
3938     movl(Address(address, offset_in_bytes), temp);
3939     offset_in_bytes += BytesPerInt;
3940     decrement(length_in_bytes, BytesPerInt);
3941   }
3942   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
3943   testptr(length_in_bytes, length_in_bytes);
3944   jcc(Assembler::zero, done);
3945 #endif
3946 
3947   // initialize topmost word, divide index by 2, check if odd and test if zero
3948   // note: for the remaining code to work, index must be a multiple of BytesPerWord
3949 #ifdef ASSERT
3950   {
3951     Label L;
3952     testptr(length_in_bytes, BytesPerWord - 1);
3953     jcc(Assembler::zero, L);
3954     stop("length must be a multiple of BytesPerWord");
3955     bind(L);
3956   }
3957 #endif
3958   Register index = length_in_bytes;
3959   if (UseIncDec) {
3960     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
3961   } else {
3962     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
3963     shrptr(index, 1);
3964   }
3965 #ifndef _LP64
  // index might not have been a multiple of 8 (i.e., bit 2 was set)
3967   {
3968     Label even;
3969     // note: if index was a multiple of 8, then it cannot
3970     //       be 0 now otherwise it must have been 0 before
3971     //       => if it is even, we don't need to check for 0 again
3972     jcc(Assembler::carryClear, even);
3973     // clear topmost word (no jump would be needed if conditional assignment worked here)
3974     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
3975     // index could be 0 now, must check again
3976     jcc(Assembler::zero, done);
3977     bind(even);
3978   }
3979 #endif // !_LP64
3980   // initialize remaining object fields: index is a multiple of 2 now
3981   {
3982     Label loop;
3983     bind(loop);
3984     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
3985     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
3986     decrement(index);
3987     jcc(Assembler::notZero, loop);
3988   }
3989 
3990   bind(done);
3991 }
3992 
3993 // Look up the method for a megamorphic invokeinterface call.
3994 // The target method is determined by <intf_klass, itable_index>.
3995 // The receiver klass is in recv_klass.
3996 // On success, the result will be in method_result, and execution falls through.
3997 // On failure, execution transfers to the given label.
3998 void MacroAssembler::lookup_interface_method(Register recv_klass,
3999                                              Register intf_klass,
4000                                              RegisterOrConstant itable_index,
4001                                              Register method_result,
4002                                              Register scan_temp,
4003                                              Label& L_no_such_interface,
4004                                              bool return_method) {
4005   assert_different_registers(recv_klass, intf_klass, scan_temp);
4006   assert_different_registers(method_result, intf_klass, scan_temp);
4007   assert(recv_klass != method_result || !return_method,
4008          "recv_klass can be destroyed when method isn't needed");
4009 
4010   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4011          "caller must use same register for non-constant itable index as for method");
4012 
4013   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
4014   int vtable_base = in_bytes(Klass::vtable_start_offset());
4015   int itentry_off = itableMethodEntry::method_offset_in_bytes();
4016   int scan_step   = itableOffsetEntry::size() * wordSize;
4017   int vte_size    = vtableEntry::size_in_bytes();
4018   Address::ScaleFactor times_vte_scale = Address::times_ptr;
4019   assert(vte_size == wordSize, "else adjust times_vte_scale");
4020 
4021   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4022 
4023   // %%% Could store the aligned, prescaled offset in the klassoop.
4024   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
4025 
4026   if (return_method) {
4027     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4028     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4029     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
4030   }
4031 
4032   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
4033   //   if (scan->interface() == intf) {
4034   //     result = (klass + scan->offset() + itable_index);
4035   //   }
4036   // }
4037   Label search, found_method;
4038 
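  // The scan is emitted twice (peel = 1, then 0): the peeled copy branches
  // forward to found_method on an immediate hit, while the second copy forms
  // the actual loop whose hit case falls through to found_method.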
4039   for (int peel = 1; peel >= 0; peel--) {
4040     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
4041     cmpptr(intf_klass, method_result);
4042 
4043     if (peel) {
4044       jccb(Assembler::equal, found_method);
4045     } else {
4046       jccb(Assembler::notEqual, search);
4047       // (invert the test to fall through to found_method...)
4048     }
4049 
4050     if (!peel)  break;
4051 
4052     bind(search);
4053 
4054     // Check that the previous entry is non-null.  A null entry means that
4055     // the receiver class doesn't implement the interface, and wasn't the
4056     // same as when the caller was compiled.
4057     testptr(method_result, method_result);
4058     jcc(Assembler::zero, L_no_such_interface);
4059     addptr(scan_temp, scan_step);
4060   }
4061 
4062   bind(found_method);
4063 
4064   if (return_method) {
4065     // Got a hit.
4066     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
4067     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
4068   }
4069 }
4070 
4071 
4072 // virtual method calling
4073 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4074                                            RegisterOrConstant vtable_index,
4075                                            Register method_result) {
4076   const int base = in_bytes(Klass::vtable_start_offset());
4077   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4078   Address vtable_entry_addr(recv_klass,
4079                             vtable_index, Address::times_ptr,
4080                             base + vtableEntry::method_offset_in_bytes());
4081   movptr(method_result, vtable_entry_addr);
4082 }
4083 
4084 
4085 void MacroAssembler::check_klass_subtype(Register sub_klass,
4086                            Register super_klass,
4087                            Register temp_reg,
4088                            Label& L_success) {
4089   Label L_failure;
4090   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
4091   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
4092   bind(L_failure);
4093 }
4094 
4095 
4096 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4097                                                    Register super_klass,
4098                                                    Register temp_reg,
4099                                                    Label* L_success,
4100                                                    Label* L_failure,
4101                                                    Label* L_slow_path,
4102                                         RegisterOrConstant super_check_offset) {
4103   assert_different_registers(sub_klass, super_klass, temp_reg);
4104   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
4105   if (super_check_offset.is_register()) {
4106     assert_different_registers(sub_klass, super_klass,
4107                                super_check_offset.as_register());
4108   } else if (must_load_sco) {
4109     assert(temp_reg != noreg, "supply either a temp or a register offset");
4110   }
4111 
4112   Label L_fallthrough;
4113   int label_nulls = 0;
4114   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4115   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4116   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
4117   assert(label_nulls <= 1, "at most one NULL in the batch");
4118 
4119   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4120   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4121   Address super_check_offset_addr(super_klass, sco_offset);
4122 
4123   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
4124   // range of a jccb.  If this routine grows larger, reconsider at
4125   // least some of these.
4126 #define local_jcc(assembler_cond, label)                                \
4127   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
4128   else                             jcc( assembler_cond, label) /*omit semi*/
4129 
4130   // Hacked jmp, which may only be used just before L_fallthrough.
4131 #define final_jmp(label)                                                \
4132   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4133   else                            jmp(label)                /*omit semi*/
4134 
4135   // If the pointers are equal, we are done (e.g., String[] elements).
4136   // This self-check enables sharing of secondary supertype arrays among
4137   // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized secondary supertype array.
4139   // We move this check to the front of the fast path because many
4140   // type checks are in fact trivially successful in this manner,
4141   // so we get a nicely predicted branch right at the start of the check.
4142   cmpptr(sub_klass, super_klass);
4143   local_jcc(Assembler::equal, *L_success);
4144 
4145   // Check the supertype display:
4146   if (must_load_sco) {
4147     // Positive movl does right thing on LP64.
4148     movl(temp_reg, super_check_offset_addr);
4149     super_check_offset = RegisterOrConstant(temp_reg);
4150   }
4151   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4152   cmpptr(super_klass, super_check_addr); // load displayed supertype
4153 
4154   // This check has worked decisively for primary supers.
4155   // Secondary supers are sought in the super_cache ('super_cache_addr').
4156   // (Secondary supers are interfaces and very deeply nested subtypes.)
4157   // This works in the same check above because of a tricky aliasing
4158   // between the super_cache and the primary super display elements.
4159   // (The 'super_check_addr' can address either, as the case requires.)
4160   // Note that the cache is updated below if it does not help us find
4161   // what we need immediately.
4162   // So if it was a primary super, we can just fail immediately.
4163   // Otherwise, it's the slow path for us (no success at this point).
4164 
4165   if (super_check_offset.is_register()) {
4166     local_jcc(Assembler::equal, *L_success);
4167     cmpl(super_check_offset.as_register(), sc_offset);
4168     if (L_failure == &L_fallthrough) {
4169       local_jcc(Assembler::equal, *L_slow_path);
4170     } else {
4171       local_jcc(Assembler::notEqual, *L_failure);
4172       final_jmp(*L_slow_path);
4173     }
4174   } else if (super_check_offset.as_constant() == sc_offset) {
4175     // Need a slow path; fast failure is impossible.
4176     if (L_slow_path == &L_fallthrough) {
4177       local_jcc(Assembler::equal, *L_success);
4178     } else {
4179       local_jcc(Assembler::notEqual, *L_slow_path);
4180       final_jmp(*L_success);
4181     }
4182   } else {
4183     // No slow path; it's a fast decision.
4184     if (L_failure == &L_fallthrough) {
4185       local_jcc(Assembler::equal, *L_success);
4186     } else {
4187       local_jcc(Assembler::notEqual, *L_failure);
4188       final_jmp(*L_success);
4189     }
4190   }
4191 
4192   bind(L_fallthrough);
4193 
4194 #undef local_jcc
4195 #undef final_jmp
4196 }
4197 
4198 
4199 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4200                                                    Register super_klass,
4201                                                    Register temp_reg,
4202                                                    Register temp2_reg,
4203                                                    Label* L_success,
4204                                                    Label* L_failure,
4205                                                    bool set_cond_codes) {
4206   assert_different_registers(sub_klass, super_klass, temp_reg);
4207   if (temp2_reg != noreg)
4208     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4209 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4210 
4211   Label L_fallthrough;
4212   int label_nulls = 0;
4213   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4214   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4215   assert(label_nulls <= 1, "at most one NULL in the batch");
4216 
4217   // a couple of useful fields in sub_klass:
4218   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4219   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4220   Address secondary_supers_addr(sub_klass, ss_offset);
4221   Address super_cache_addr(     sub_klass, sc_offset);
4222 
4223   // Do a linear scan of the secondary super-klass chain.
4224   // This code is rarely used, so simplicity is a virtue here.
4225   // The repne_scan instruction uses fixed registers, which we must spill.
4226   // Don't worry too much about pre-existing connections with the input regs.
4227 
4228   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4229   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4230 
4231   // Get super_klass value into rax (even if it was in rdi or rcx).
4232   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4233   if (super_klass != rax) {
4234     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4235     mov(rax, super_klass);
4236   }
4237   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4238   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4239 
4240 #ifndef PRODUCT
4241   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4242   ExternalAddress pst_counter_addr((address) pst_counter);
4243   NOT_LP64(  incrementl(pst_counter_addr) );
4244   LP64_ONLY( lea(rcx, pst_counter_addr) );
4245   LP64_ONLY( incrementl(Address(rcx, 0)) );
4246 #endif //PRODUCT
4247 
4248   // We will consult the secondary-super array.
4249   movptr(rdi, secondary_supers_addr);
4250   // Load the array length.  (Positive movl does right thing on LP64.)
4251   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4252   // Skip to start of data.
4253   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4254 
4255   // Scan RCX words at [RDI] for an occurrence of RAX.
4256   // Set NZ/Z based on last compare.
4257   // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
4258   // not change flags (only scas instruction which is repeated sets flags).
4259   // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
4260 
4261     testptr(rax,rax); // Set Z = 0
4262     repne_scan();
4263 
4264   // Unspill the temp. registers:
4265   if (pushed_rdi)  pop(rdi);
4266   if (pushed_rcx)  pop(rcx);
4267   if (pushed_rax)  pop(rax);
4268 
4269   if (set_cond_codes) {
4270     // Special hack for the AD files:  rdi is guaranteed non-zero.
4271     assert(!pushed_rdi, "rdi must be left non-NULL");
4272     // Also, the condition codes are properly set Z/NZ on succeed/failure.
4273   }
4274 
  if (L_failure == &L_fallthrough) {
    jccb(Assembler::notEqual, *L_failure);
  } else {
    jcc(Assembler::notEqual, *L_failure);
  }
4278 
4279   // Success.  Cache the super we found and proceed in triumph.
4280   movptr(super_cache_addr, super_klass);
4281 
4282   if (L_success != &L_fallthrough) {
4283     jmp(*L_success);
4284   }
4285 
4286 #undef IS_A_TEMP
4287 
4288   bind(L_fallthrough);
4289 }
4290 
4291 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4292   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
4293 
4294   Label L_fallthrough;
4295   if (L_fast_path == NULL) {
4296     L_fast_path = &L_fallthrough;
4297   } else if (L_slow_path == NULL) {
4298     L_slow_path = &L_fallthrough;
4299   }
4300 
4301   // Fast path check: class is fully initialized
4302   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4303   jcc(Assembler::equal, *L_fast_path);
4304 
4305   // Fast path check: current thread is initializer thread
4306   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4307   if (L_slow_path == &L_fallthrough) {
4308     jcc(Assembler::equal, *L_fast_path);
4309     bind(*L_slow_path);
4310   } else if (L_fast_path == &L_fallthrough) {
4311     jcc(Assembler::notEqual, *L_slow_path);
4312     bind(*L_fast_path);
4313   } else {
4314     Unimplemented();
4315   }
4316 }
4317 
4318 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4319   if (VM_Version::supports_cmov()) {
4320     cmovl(cc, dst, src);
4321   } else {
4322     Label L;
4323     jccb(negate_condition(cc), L);
4324     movl(dst, src);
4325     bind(L);
4326   }
4327 }
4328 
4329 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4330   if (VM_Version::supports_cmov()) {
4331     cmovl(cc, dst, src);
4332   } else {
4333     Label L;
4334     jccb(negate_condition(cc), L);
4335     movl(dst, src);
4336     bind(L);
4337   }
4338 }
4339 
4340 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4341   if (!VerifyOops) return;
4342 
4343   // Pass register number to verify_oop_subroutine
4344   const char* b = NULL;
4345   {
4346     ResourceMark rm;
4347     stringStream ss;
4348     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4349     b = code_string(ss.as_string());
4350   }
4351   BLOCK_COMMENT("verify_oop {");
4352 #ifdef _LP64
4353   push(rscratch1);                    // save r10, trashed by movptr()
4354 #endif
4355   push(rax);                          // save rax,
4356   push(reg);                          // pass register argument
4357   ExternalAddress buffer((address) b);
4358   // avoid using pushptr, as it modifies scratch registers
4359   // and our contract is not to modify anything
4360   movptr(rax, buffer.addr());
4361   push(rax);
4362   // call indirectly to solve generation ordering problem
4363   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4364   call(rax);
4365   // Caller pops the arguments (oop, message) and restores rax, r10
4366   BLOCK_COMMENT("} verify_oop");
4367 }
4368 
4369 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
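  // vpternlogd with immediate 0xFF writes all-ones regardless of the inputs;
  // the fallback path gets the same effect from vpcmpeqb dst, dst, dst.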
4370   if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4371     vpternlogd(dst, 0xFF, dst, dst, vector_len);
4372   } else {
4373     assert(UseAVX > 0, "");
4374     vpcmpeqb(dst, dst, dst, vector_len);
4375   }
4376 }
4377 
4378 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4379                                          int extra_slot_offset) {
4380   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4381   int stackElementSize = Interpreter::stackElementSize;
4382   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4383 #ifdef ASSERT
4384   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4385   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4386 #endif
4387   Register             scale_reg    = noreg;
4388   Address::ScaleFactor scale_factor = Address::no_scale;
4389   if (arg_slot.is_constant()) {
4390     offset += arg_slot.as_constant() * stackElementSize;
4391   } else {
4392     scale_reg    = arg_slot.as_register();
4393     scale_factor = Address::times(stackElementSize);
4394   }
4395   offset += wordSize;           // return PC is on stack
4396   return Address(rsp, scale_reg, scale_factor, offset);
4397 }
4398 
4399 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
4400   if (!VerifyOops) return;
4401 
4402   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4403   // Pass register number to verify_oop_subroutine
4404   const char* b = NULL;
4405   {
4406     ResourceMark rm;
4407     stringStream ss;
4408     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
4409     b = code_string(ss.as_string());
4410   }
4411 #ifdef _LP64
4412   push(rscratch1);                    // save r10, trashed by movptr()
4413 #endif
4414   push(rax);                          // save rax,
4415   // addr may contain rsp so we will have to adjust it based on the push
4416   // we just did (and on 64 bit we do two pushes)
4417   // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
4418   // stores rax into addr which is backwards of what was intended.
4419   if (addr.uses(rsp)) {
4420     lea(rax, addr);
4421     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4422   } else {
4423     pushptr(addr);
4424   }
4425 
4426   ExternalAddress buffer((address) b);
4427   // pass msg argument
4428   // avoid using pushptr, as it modifies scratch registers
4429   // and our contract is not to modify anything
4430   movptr(rax, buffer.addr());
4431   push(rax);
4432 
4433   // call indirectly to solve generation ordering problem
4434   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4435   call(rax);
4436   // Caller pops the arguments (addr, message) and restores rax, r10.
4437 }
4438 
4439 void MacroAssembler::verify_tlab() {
4440 #ifdef ASSERT
4441   if (UseTLAB && VerifyOops) {
4442     Label next, ok;
4443     Register t1 = rsi;
4444     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4445 
4446     push(t1);
4447     NOT_LP64(push(thread_reg));
4448     NOT_LP64(get_thread(thread_reg));
4449 
4450     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4451     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4452     jcc(Assembler::aboveEqual, next);
4453     STOP("assert(top >= start)");
4454     should_not_reach_here();
4455 
4456     bind(next);
4457     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4458     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4459     jcc(Assembler::aboveEqual, ok);
4460     STOP("assert(top <= end)");
4461     should_not_reach_here();
4462 
4463     bind(ok);
4464     NOT_LP64(pop(thread_reg));
4465     pop(t1);
4466   }
4467 #endif
4468 }
4469 
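// The helper classes below are plain views over the x87 FPU state image as
// laid out by fnsave/fstenv: the control word (exception masks in bits 0-5,
// precision control in bits 8-9, rounding control in bits 10-11), the status
// word (exception flags, condition codes C0-C3, TOP), the tag word, and the
// eight 10-byte data registers.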
4470 class ControlWord {
4471  public:
4472   int32_t _value;
4473 
4474   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4475   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4476   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4477   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4478   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4479   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4480   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4481   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4482 
4483   void print() const {
4484     // rounding control
4485     const char* rc;
4486     switch (rounding_control()) {
4487       case 0: rc = "round near"; break;
4488       case 1: rc = "round down"; break;
4489       case 2: rc = "round up  "; break;
4490       case 3: rc = "chop      "; break;
4491       default:
4492         rc = NULL; // silence compiler warnings
4493         fatal("Unknown rounding control: %d", rounding_control());
    }
4495     // precision control
4496     const char* pc;
4497     switch (precision_control()) {
4498       case 0: pc = "24 bits "; break;
4499       case 1: pc = "reserved"; break;
4500       case 2: pc = "53 bits "; break;
4501       case 3: pc = "64 bits "; break;
4502       default:
4503         pc = NULL; // silence compiler warnings
4504         fatal("Unknown precision control: %d", precision_control());
    }
4506     // flags
4507     char f[9];
4508     f[0] = ' ';
4509     f[1] = ' ';
4510     f[2] = (precision   ()) ? 'P' : 'p';
4511     f[3] = (underflow   ()) ? 'U' : 'u';
4512     f[4] = (overflow    ()) ? 'O' : 'o';
4513     f[5] = (zero_divide ()) ? 'Z' : 'z';
4514     f[6] = (denormalized()) ? 'D' : 'd';
4515     f[7] = (invalid     ()) ? 'I' : 'i';
4516     f[8] = '\x0';
4517     // output
4518     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4519   }
4520 
4521 };
4522 
4523 class StatusWord {
4524  public:
4525   int32_t _value;
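       // x87 status word layout (low 16 bits): bits 0-5 are the exception flags,
       // bit 6 stack fault, bit 7 error summary, bit 8 C0, bit 9 C1, bit 10 C2,
       // bits 11-13 the top-of-stack pointer, bit 14 C3, bit 15 busy.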
4526 
4527   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4528   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4529   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4530   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4531   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4532   int  top() const                     { return  (_value >> 11) & 7      ; }
4533   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4534   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4535   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4536   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4537   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4538   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4539   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4540   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4541 
4542   void print() const {
4543     // condition codes
4544     char c[5];
4545     c[0] = (C3()) ? '3' : '-';
4546     c[1] = (C2()) ? '2' : '-';
4547     c[2] = (C1()) ? '1' : '-';
4548     c[3] = (C0()) ? '0' : '-';
4549     c[4] = '\x0';
4550     // flags
4551     char f[9];
4552     f[0] = (error_status()) ? 'E' : '-';
4553     f[1] = (stack_fault ()) ? 'S' : '-';
4554     f[2] = (precision   ()) ? 'P' : '-';
4555     f[3] = (underflow   ()) ? 'U' : '-';
4556     f[4] = (overflow    ()) ? 'O' : '-';
4557     f[5] = (zero_divide ()) ? 'Z' : '-';
4558     f[6] = (denormalized()) ? 'D' : '-';
4559     f[7] = (invalid     ()) ? 'I' : '-';
4560     f[8] = '\x0';
4561     // output
4562     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4563   }
4564 
4565 };
4566 
4567 class TagWord {
4568  public:
4569   int32_t _value;
4570 
4571   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
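       // Each of the eight FPU registers has a 2-bit tag: 0 = valid, 1 = zero,
       // 2 = special (NaN, infinity, denormal), 3 = empty (see tag_as_string() below).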
4572 
4573   void print() const {
4574     printf("%04x", _value & 0xFFFF);
4575   }
4576 
4577 };
4578 
4579 class FPU_Register {
4580  public:
4581   int32_t _m0;
4582   int32_t _m1;
4583   int16_t _ex;
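       // Image of one 80-bit x87 register: _m0/_m1 hold the low/high 32 bits of the
       // mantissa, _ex holds the sign bit and 15-bit exponent.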
4584 
4585   bool is_indefinite() const           {
4586     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4587   }
4588 
4589   void print() const {
4590     char  sign = (_ex < 0) ? '-' : '+';
4591     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4592     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
4593   };
4594 
4595 };
4596 
4597 class FPU_State {
4598  public:
4599   enum {
4600     register_size       = 10,
4601     number_of_registers =  8,
4602     register_mask       =  7
4603   };
4604 
4605   ControlWord  _control_word;
4606   StatusWord   _status_word;
4607   TagWord      _tag_word;
4608   int32_t      _error_offset;
4609   int32_t      _error_selector;
4610   int32_t      _data_offset;
4611   int32_t      _data_selector;
4612   int8_t       _register[register_size * number_of_registers];
4613 
4614   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4615   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
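       // Note: the FNSAVE image stores registers in ST order (ST0 first), while the tag
       // word is indexed by physical register number, hence the (top + i) mod 8 mapping
       // in tag_for_st() above.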
4616 
4617   const char* tag_as_string(int tag) const {
4618     switch (tag) {
4619       case 0: return "valid";
4620       case 1: return "zero";
4621       case 2: return "special";
4622       case 3: return "empty";
4623     }
4624     ShouldNotReachHere();
4625     return NULL;
4626   }
4627 
4628   void print() const {
4629     // print computation registers
4630     { int t = _status_word.top();
4631       for (int i = 0; i < number_of_registers; i++) {
4632         int j = (i - t) & register_mask;
4633         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4634         st(j)->print();
4635         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4636       }
4637     }
4638     printf("\n");
4639     // print control registers
4640     printf("ctrl = "); _control_word.print(); printf("\n");
4641     printf("stat = "); _status_word .print(); printf("\n");
4642     printf("tags = "); _tag_word    .print(); printf("\n");
4643   }
4644 
4645 };
4646 
4647 class Flag_Register {
4648  public:
4649   int32_t _value;
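       // EFLAGS layout: CF bit 0, PF bit 2, AF bit 4, ZF bit 6, SF bit 7, DF bit 10,
       // OF bit 11 (matching the accessors below).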
4650 
4651   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
4652   bool direction() const               { return ((_value >> 10) & 1) != 0; }
4653   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
4654   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
4655   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
4656   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
4657   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
4658 
4659   void print() const {
4660     // flags
4661     char f[8];
4662     f[0] = (overflow       ()) ? 'O' : '-';
4663     f[1] = (direction      ()) ? 'D' : '-';
4664     f[2] = (sign           ()) ? 'S' : '-';
4665     f[3] = (zero           ()) ? 'Z' : '-';
4666     f[4] = (auxiliary_carry()) ? 'A' : '-';
4667     f[5] = (parity         ()) ? 'P' : '-';
4668     f[6] = (carry          ()) ? 'C' : '-';
4669     f[7] = '\x0';
4670     // output
4671     printf("%08x  flags = %s", _value, f);
4672   }
4673 
4674 };
4675 
4676 class IU_Register {
4677  public:
4678   int32_t _value;
4679 
4680   void print() const {
4681     printf("%08x  %11d", _value, _value);
4682   }
4683 
4684 };
4685 
4686 class IU_State {
4687  public:
4688   Flag_Register _eflags;
4689   IU_Register   _rdi;
4690   IU_Register   _rsi;
4691   IU_Register   _rbp;
4692   IU_Register   _rsp;
4693   IU_Register   _rbx;
4694   IU_Register   _rdx;
4695   IU_Register   _rcx;
4696   IU_Register   _rax;
4697 
4698   void print() const {
4699     // computation registers
4700     printf("rax,  = "); _rax.print(); printf("\n");
4701     printf("rbx,  = "); _rbx.print(); printf("\n");
4702     printf("rcx  = "); _rcx.print(); printf("\n");
4703     printf("rdx  = "); _rdx.print(); printf("\n");
4704     printf("rdi  = "); _rdi.print(); printf("\n");
4705     printf("rsi  = "); _rsi.print(); printf("\n");
4706     printf("rbp,  = "); _rbp.print(); printf("\n");
4707     printf("rsp  = "); _rsp.print(); printf("\n");
4708     printf("\n");
4709     // control registers
4710     printf("flgs = "); _eflags.print(); printf("\n");
4711   }
4712 };
4713 
4714 
4715 class CPU_State {
4716  public:
4717   FPU_State _fpu_state;
4718   IU_State  _iu_state;
4719 
4720   void print() const {
4721     printf("--------------------------------------------------\n");
4722     _iu_state .print();
4723     printf("\n");
4724     _fpu_state.print();
4725     printf("--------------------------------------------------\n");
4726   }
4727 
4728 };
4729 
4730 
4731 static void _print_CPU_state(CPU_State* state) {
4732   state->print();
4733 };
4734 
4735 
4736 void MacroAssembler::print_CPU_state() {
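       // push_CPU_state() saves the integer and FPU registers on the stack in the
       // CPU_State layout, so rsp can be passed directly as the CPU_State* argument.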
4737   push_CPU_state();
4738   push(rsp);                // pass CPU state
4739   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
4740   addptr(rsp, wordSize);       // discard argument
4741   pop_CPU_state();
4742 }
4743 
4744 
4745 #ifndef _LP64
4746 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
4747   static int counter = 0;
4748   FPU_State* fs = &state->_fpu_state;
4749   counter++;
4750   // For leaf calls, only verify that the top few elements remain empty.
4751   // We only need 1 empty at the top for C2 code.
4752   if( stack_depth < 0 ) {
4753     if( fs->tag_for_st(7) != 3 ) {
4754       printf("FPR7 not empty\n");
4755       state->print();
4756       assert(false, "error");
4757       return false;
4758     }
4759     return true;                // All other stack states do not matter
4760   }
4761 
4762   assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(),
4763          "bad FPU control word");
4764 
4765   // compute stack depth
4766   int i = 0;
4767   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
4768   int d = i;
4769   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
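       // d counts the non-empty entries below the first empty slot; the second loop then
       // requires every remaining slot to be empty (e.g. tags V V E E E E E E gives d == 2
       // and i == 8), otherwise the stack is not contiguous.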
4770   // verify findings
4771   if (i != FPU_State::number_of_registers) {
4772     // stack not contiguous
4773     printf("%s: stack not contiguous at ST%d\n", s, i);
4774     state->print();
4775     assert(false, "error");
4776     return false;
4777   }
4778   // check if computed stack depth corresponds to expected stack depth
4779   if (stack_depth < 0) {
4780     // expected stack depth is -stack_depth or less
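         // (note: a negative stack_depth already returned from the leaf-call check at the
         //  top of this function, so this branch is not expected to be reached)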
4781     if (d > -stack_depth) {
4782       // too many elements on the stack
4783       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
4784       state->print();
4785       assert(false, "error");
4786       return false;
4787     }
4788   } else {
4789     // expected stack depth is stack_depth
4790     if (d != stack_depth) {
4791       // wrong stack depth
4792       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
4793       state->print();
4794       assert(false, "error");
4795       return false;
4796     }
4797   }
4798   // everything is cool
4799   return true;
4800 }
4801 
4802 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
4803   if (!VerifyFPU) return;
4804   push_CPU_state();
4805   push(rsp);                // pass CPU state
4806   ExternalAddress msg((address) s);
4807   // pass message string s
4808   pushptr(msg.addr());
4809   push(stack_depth);        // pass stack depth
4810   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
4811   addptr(rsp, 3 * wordSize);   // discard arguments
4812   // check for error
4813   { Label L;
4814     testl(rax, rax);
4815     jcc(Assembler::notZero, L);
4816     int3();                  // break if error condition
4817     bind(L);
4818   }
4819   pop_CPU_state();
4820 }
4821 #endif // _LP64
4822 
4823 void MacroAssembler::restore_cpu_control_state_after_jni() {
4824   // Either restore the MXCSR register after returning from the JNI Call
4825   // or verify that it wasn't changed (with -Xcheck:jni flag).
4826   if (VM_Version::supports_sse()) {
4827     if (RestoreMXCSROnJNICalls) {
4828       ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()));
4829     } else if (CheckJNICalls) {
4830       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
4831     }
4832   }
4833   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
4834   vzeroupper();
4835 
4836 #ifndef _LP64
4837   // Either restore the x87 floating point control word after returning
4838   // from the JNI call or verify that it wasn't changed.
4839   if (CheckJNICalls) {
4840     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
4841   }
4842 #endif // _LP64
4843 }
4844 
4845 // ((OopHandle)result).resolve();
4846 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
4847   assert_different_registers(result, tmp);
4848 
4849   // Only 64 bit platforms support GCs that require a tmp register
4850   // Only IN_HEAP loads require a thread_tmp register
4851   // OopHandle::resolve is an indirection like jobject.
4852   access_load_at(T_OBJECT, IN_NATIVE,
4853                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
4854 }
4855 
4856 // ((WeakHandle)result).resolve();
4857 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
4858   assert_different_registers(rresult, rtmp);
4859   Label resolved;
4860 
4861   // A null weak handle resolves to null.
4862   cmpptr(rresult, 0);
4863   jcc(Assembler::equal, resolved);
4864 
4865   // Only 64 bit platforms support GCs that require a tmp register
4866   // Only IN_HEAP loads require a thread_tmp register
4867   // WeakHandle::resolve is an indirection like jweak.
4868   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4869                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
4870   bind(resolved);
4871 }
4872 
4873 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
4874   // get mirror
4875   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
4876   load_method_holder(mirror, method);
4877   movptr(mirror, Address(mirror, mirror_offset));
4878   resolve_oop_handle(mirror, tmp);
4879 }
4880 
4881 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
4882   load_method_holder(rresult, rmethod);
4883   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
4884 }
4885 
4886 void MacroAssembler::load_method_holder(Register holder, Register method) {
4887   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4888   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4889   movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
4890 }
4891 
4892 #ifdef _LP64
4893 void MacroAssembler::load_nklass(Register dst, Register src) {
4894   assert_different_registers(src, dst);
4895   assert(UseCompressedClassPointers, "expect compressed class pointers");
4896 
4897   Label slow, done;
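       // Fast path: for an unlocked object the narrow Klass* sits in the upper bits of the
       // mark word. XOR-ing with unlocked_value turns the unlocked lock-bit pattern into
       // zero, so the testb below falls through only for a plain unlocked header; any
       // locked or inflated header takes the slow-path stub instead.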
4898   movq(dst, Address(src, oopDesc::mark_offset_in_bytes()));
4899   // NOTE: While it would seem nice to use xorb instead (for which we don't have an encoding in our assembler),
4900   // the encoding for xorq uses the signed version (0x81/6) of xor, which encodes as compactly as xorb would,
4901   // and doesn't make a difference performance-wise.
4902   xorq(dst, markWord::unlocked_value);
4903   testb(dst, markWord::lock_mask_in_place);
4904   jccb(Assembler::notZero, slow);
4905 
4906   shrq(dst, markWord::klass_shift);
4907   jmp(done);
4908   bind(slow);
4909 
4910   if (dst != rax) {
4911     push(rax);
4912   }
4913   if (src != rax) {
4914     mov(rax, src);
4915   }
4916   call(RuntimeAddress(StubRoutines::load_nklass()));
4917   if (dst != rax) {
4918     mov(dst, rax);
4919     pop(rax);
4920   }
4921 
4922   bind(done);
4923 }
4924 #endif
4925 
4926 void MacroAssembler::load_klass(Register dst, Register src, Register tmp, bool null_check_src) {
4927   assert_different_registers(src, tmp);
4928   assert_different_registers(dst, tmp);
4929 #ifdef _LP64
4930   assert(UseCompressedClassPointers, "expect compressed class pointers");
4931   Register d = dst;
4932   if (src == dst) {
4933     d = tmp;
4934   }
4935   if (null_check_src) {
4936     null_check(src, oopDesc::mark_offset_in_bytes());
4937   }
4938   load_nklass(d, src);
4939   if (src == dst) {
4940     mov(dst, d);
4941   }
4942   decode_klass_not_null(dst, tmp);
4943 #else
4944   if (null_check_src) {
4945     null_check(src, oopDesc::klass_offset_in_bytes());
4946   }
4947   movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4948 #endif
4949 }
4950 
4951 #ifndef _LP64
4952 void MacroAssembler::store_klass(Register dst, Register src) {
4953   movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4954 }
4955 #endif
4956 
4957 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
4958                                     Register tmp1, Register thread_tmp) {
4959   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4960   decorators = AccessInternal::decorator_fixup(decorators);
4961   bool as_raw = (decorators & AS_RAW) != 0;
4962   if (as_raw) {
4963     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4964   } else {
4965     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4966   }
4967 }
4968 
4969 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
4970                                      Register tmp1, Register tmp2, Register tmp3) {
4971   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4972   decorators = AccessInternal::decorator_fixup(decorators);
4973   bool as_raw = (decorators & AS_RAW) != 0;
4974   if (as_raw) {
4975     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3);
4976   } else {
4977     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3);
4978   }
4979 }
4980 
4981 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4982                                    Register thread_tmp, DecoratorSet decorators) {
4983   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4984 }
4985 
4986 // Doesn't do verification, generates fixed size code
4987 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4988                                             Register thread_tmp, DecoratorSet decorators) {
4989   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4990 }
4991 
4992 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4993                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
4994   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2, tmp3);
4995 }
4996 
4997 // Used for storing NULLs.
4998 void MacroAssembler::store_heap_oop_null(Address dst) {
4999   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
5000 }
5001 
5002 #ifdef _LP64
5003 #ifdef ASSERT
5004 void MacroAssembler::verify_heapbase(const char* msg) {
5005   assert (UseCompressedOops, "should be compressed");
5006   assert (Universe::heap() != NULL, "java heap should be initialized");
5007   if (CheckCompressedOops) {
5008     Label ok;
5009     const auto src2 = ExternalAddress((address)CompressedOops::ptrs_base_addr());
5010     assert(!src2.is_lval(), "should not be lval");
5011     const bool is_src2_reachable = reachable(src2);
5012     if (!is_src2_reachable) {
5013       push(rscratch1);  // cmpptr trashes rscratch1
5014     }
5015     cmpptr(r12_heapbase, src2);
5016     jcc(Assembler::equal, ok);
5017     STOP(msg);
5018     bind(ok);
5019     if (!is_src2_reachable) {
5020       pop(rscratch1);
5021     }
5022   }
5023 }
5024 #endif
5025 
5026 // Algorithm must match oop.inline.hpp encode_heap_oop.
5027 void MacroAssembler::encode_heap_oop(Register r) {
5028 #ifdef ASSERT
5029   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5030 #endif
5031   verify_oop_msg(r, "broken oop in encode_heap_oop");
5032   if (CompressedOops::base() == NULL) {
5033     if (CompressedOops::shift() != 0) {
5034       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5035       shrq(r, LogMinObjAlignmentInBytes);
5036     }
5037     return;
5038   }
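       // With a non-null base: narrowOop = (oop == NULL) ? 0 : (oop - heap_base) >> shift.
       // The cmov below maps a NULL oop to heap_base so the subtraction yields 0.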
5039   testq(r, r);
5040   cmovq(Assembler::equal, r, r12_heapbase);
5041   subq(r, r12_heapbase);
5042   shrq(r, LogMinObjAlignmentInBytes);
5043 }
5044 
5045 void MacroAssembler::encode_heap_oop_not_null(Register r) {
5046 #ifdef ASSERT
5047   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5048   if (CheckCompressedOops) {
5049     Label ok;
5050     testq(r, r);
5051     jcc(Assembler::notEqual, ok);
5052     STOP("null oop passed to encode_heap_oop_not_null");
5053     bind(ok);
5054   }
5055 #endif
5056   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
5057   if (CompressedOops::base() != NULL) {
5058     subq(r, r12_heapbase);
5059   }
5060   if (CompressedOops::shift() != 0) {
5061     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5062     shrq(r, LogMinObjAlignmentInBytes);
5063   }
5064 }
5065 
5066 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5067 #ifdef ASSERT
5068   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5069   if (CheckCompressedOops) {
5070     Label ok;
5071     testq(src, src);
5072     jcc(Assembler::notEqual, ok);
5073     STOP("null oop passed to encode_heap_oop_not_null2");
5074     bind(ok);
5075   }
5076 #endif
5077   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
5078   if (dst != src) {
5079     movq(dst, src);
5080   }
5081   if (CompressedOops::base() != NULL) {
5082     subq(dst, r12_heapbase);
5083   }
5084   if (CompressedOops::shift() != 0) {
5085     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5086     shrq(dst, LogMinObjAlignmentInBytes);
5087   }
5088 }
5089 
5090 void  MacroAssembler::decode_heap_oop(Register r) {
5091 #ifdef ASSERT
5092   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5093 #endif
5094   if (CompressedOops::base() == NULL) {
5095     if (CompressedOops::shift() != 0) {
5096       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5097       shlq(r, LogMinObjAlignmentInBytes);
5098     }
5099   } else {
5100     Label done;
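         // oop = (narrow == 0) ? NULL : heap_base + (narrow << shift); shlq sets ZF for a
         // zero input, so the addq of the base is skipped for a null narrow oop.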
5101     shlq(r, LogMinObjAlignmentInBytes);
5102     jccb(Assembler::equal, done);
5103     addq(r, r12_heapbase);
5104     bind(done);
5105   }
5106   verify_oop_msg(r, "broken oop in decode_heap_oop");
5107 }
5108 
5109 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
5110   // Note: it will change flags
5111   assert (UseCompressedOops, "should only be used for compressed headers");
5112   assert (Universe::heap() != NULL, "java heap should be initialized");
5113   // Cannot assert, unverified entry point counts instructions (see .ad file)
5114   // vtableStubs also counts instructions in pd_code_size_limit.
5115   // Also do not verify_oop as this is called by verify_oop.
5116   if (CompressedOops::shift() != 0) {
5117     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5118     shlq(r, LogMinObjAlignmentInBytes);
5119     if (CompressedOops::base() != NULL) {
5120       addq(r, r12_heapbase);
5121     }
5122   } else {
5123     assert (CompressedOops::base() == NULL, "sanity");
5124   }
5125 }
5126 
5127 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5128   // Note: it will change flags
5129   assert (UseCompressedOops, "should only be used for compressed headers");
5130   assert (Universe::heap() != NULL, "java heap should be initialized");
5131   // Cannot assert, unverified entry point counts instructions (see .ad file)
5132   // vtableStubs also counts instructions in pd_code_size_limit.
5133   // Also do not verify_oop as this is called by verify_oop.
5134   if (CompressedOops::shift() != 0) {
5135     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5136     if (LogMinObjAlignmentInBytes == Address::times_8) {
5137       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5138     } else {
5139       if (dst != src) {
5140         movq(dst, src);
5141       }
5142       shlq(dst, LogMinObjAlignmentInBytes);
5143       if (CompressedOops::base() != NULL) {
5144         addq(dst, r12_heapbase);
5145       }
5146     }
5147   } else {
5148     assert (CompressedOops::base() == NULL, "sanity");
5149     if (dst != src) {
5150       movq(dst, src);
5151     }
5152   }
5153 }
5154 
5155 MacroAssembler::KlassDecodeMode MacroAssembler::_klass_decode_mode = KlassDecodeNone;
5156 
5157 // Returns a static string
5158 const char* MacroAssembler::describe_klass_decode_mode(MacroAssembler::KlassDecodeMode mode) {
5159   switch (mode) {
5160   case KlassDecodeNone: return "none";
5161   case KlassDecodeZero: return "zero";
5162   case KlassDecodeXor:  return "xor";
5163   case KlassDecodeAdd:  return "add";
5164   default:
5165     ShouldNotReachHere();
5166   }
5167   return NULL;
5168 }
5169 
5170 // Return the current narrow Klass pointer decode mode.
5171 MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode() {
5172   if (_klass_decode_mode == KlassDecodeNone) {
5173     // First time initialization
5174     assert(UseCompressedClassPointers, "not using compressed class pointers");
5175     assert(Metaspace::initialized(), "metaspace not initialized yet");
5176 
5177     _klass_decode_mode = klass_decode_mode_for_base(CompressedKlassPointers::base());
5178     guarantee(_klass_decode_mode != KlassDecodeNone,
5179               PTR_FORMAT " is not a valid encoding base on x86",
5180               p2i(CompressedKlassPointers::base()));
5181     log_info(metaspace)("klass decode mode initialized: %s", describe_klass_decode_mode(_klass_decode_mode));
5182   }
5183   return _klass_decode_mode;
5184 }
5185 
5186 // Given an arbitrary base address, return the KlassDecodeMode that would be used. Return KlassDecodeNone
5187 // if base address is not valid for encoding.
5188 MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode_for_base(address base) {
5189   assert(CompressedKlassPointers::shift() != 0, "not lilliput?");
5190 
5191   const uint64_t base_u64 = (uint64_t) base;
5192 
5193   if (base_u64 == 0) {
5194     return KlassDecodeZero;
5195   }
5196 
5197   if ((base_u64 & (KlassEncodingMetaspaceMax - 1)) == 0) {
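         // A base aligned to KlassEncodingMetaspaceMax shares no bits with any offset
         // inside the encoding range, so xor can both apply and strip the base.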
5198     return KlassDecodeXor;
5199   }
5200 
5201   // Note that there is no point in optimizing for shift=3 since lilliput
5202   // will use larger shifts
5203 
5204   // The add+shift mode for decode_and_move_klass_not_null() requires the base to be
5205   //  shiftable-without-loss. So, this is the minimum restriction on x64 for a valid
5206   //  encoding base. This does not matter in reality since the shift values we use for
5207   //  Lilliput, while large, won't be larger than a page size. And the encoding base
5208   //  will be quite likely page aligned since it usually falls to the beginning of
5209   //  either CDS or CCS.
5210   if ((base_u64 & (KlassAlignmentInBytes - 1)) == 0) {
5211     return KlassDecodeAdd;
5212   }
5213 
5214   return KlassDecodeNone;
5215 }
5216 
5217 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
5218   assert_different_registers(r, tmp);
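       // narrowKlass = Klass* >> shift (zero base), (Klass* ^ base) >> shift (xor mode),
       // or (Klass* - base) >> shift (add mode).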
5219   switch (klass_decode_mode()) {
5220   case KlassDecodeZero: {
5221     shrq(r, CompressedKlassPointers::shift());
5222     break;
5223   }
5224   case KlassDecodeXor: {
5225     mov64(tmp, (int64_t)CompressedKlassPointers::base());
5226     xorq(r, tmp);
5227     shrq(r, CompressedKlassPointers::shift());
5228     break;
5229   }
5230   case KlassDecodeAdd: {
5231     mov64(tmp, (int64_t)CompressedKlassPointers::base());
5232     subq(r, tmp);
5233     shrq(r, CompressedKlassPointers::shift());
5234     break;
5235   }
5236   default:
5237     ShouldNotReachHere();
5238   }
5239 }
5240 
5241 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
5242   assert_different_registers(src, dst);
5243   switch (klass_decode_mode()) {
5244   case KlassDecodeZero: {
5245     movptr(dst, src);
5246     shrq(dst, CompressedKlassPointers::shift());
5247     break;
5248   }
5249   case KlassDecodeXor: {
5250     mov64(dst, (int64_t)CompressedKlassPointers::base());
5251     xorq(dst, src);
5252     shrq(dst, CompressedKlassPointers::shift());
5253     break;
5254   }
5255   case KlassDecodeAdd: {
5256     mov64(dst, -(int64_t)CompressedKlassPointers::base());
5257     addq(dst, src);
5258     shrq(dst, CompressedKlassPointers::shift());
5259     break;
5260   }
5261   default:
5262     ShouldNotReachHere();
5263   }
5264 }
5265 
5266 void  MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
5267   assert_different_registers(r, tmp);
5268   const uint64_t base_u64 = (uint64_t)CompressedKlassPointers::base();
5269   switch (klass_decode_mode()) {
5270   case KlassDecodeZero: {
5271     shlq(r, CompressedKlassPointers::shift());
5272     break;
5273   }
5274   case KlassDecodeXor: {
5275     assert((base_u64 & (KlassEncodingMetaspaceMax - 1)) == 0,
5276            "base " UINT64_FORMAT_X " invalid for xor mode", base_u64); // should have been handled at VM init.
5277     shlq(r, CompressedKlassPointers::shift());
5278     mov64(tmp, base_u64);
5279     xorq(r, tmp);
5280     break;
5281   }
5282   case KlassDecodeAdd: {
5283     shlq(r, CompressedKlassPointers::shift());
5284     mov64(tmp, base_u64);
5285     addq(r, tmp);
5286     break;
5287   }
5288   default:
5289     ShouldNotReachHere();
5290   }
5291 }
5292 
5293 void  MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
5294   assert_different_registers(src, dst);
5295   // Note: Cannot assert, unverified entry point counts instructions (see .ad file)
5296   // vtableStubs also counts instructions in pd_code_size_limit.
5297   // Also do not verify_oop as this is called by verify_oop.
5298 
5299   const uint64_t base_u64 = (uint64_t)CompressedKlassPointers::base();
5300 
5301   switch (klass_decode_mode()) {
5302   case KlassDecodeZero: {
5303     movq(dst, src);
5304     shlq(dst, CompressedKlassPointers::shift());
5305     break;
5306   }
5307   case KlassDecodeXor: {
5308     assert((base_u64 & (KlassEncodingMetaspaceMax - 1)) == 0,
5309            "base " UINT64_FORMAT_X " invalid for xor mode", base_u64); // should have been handled at VM init.
5310     const uint64_t base_right_shifted = base_u64 >> CompressedKlassPointers::shift();
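         // (narrow ^ (base >> shift)) << shift == (narrow << shift) ^ base, because the
         // base has no bits below the shift.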
5311     mov64(dst, base_right_shifted);
5312     xorq(dst, src);
5313     shlq(dst, CompressedKlassPointers::shift());
5314     break;
5315   }
5316   case KlassDecodeAdd: {
5317     assert((base_u64 & (KlassAlignmentInBytes - 1)) == 0,
5318            "base " UINT64_FORMAT_X " invalid for add mode", base_u64); // should have been handled at VM init.
5319     const uint64_t base_right_shifted = base_u64 >> CompressedKlassPointers::shift();
5320     mov64(dst, base_right_shifted);
5321     addq(dst, src);
5322     shlq(dst, CompressedKlassPointers::shift());
5323     break;
5324   }
5325   default:
5326     ShouldNotReachHere();
5327   }
5328 }
5329 
5330 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5331   assert (UseCompressedOops, "should only be used for compressed headers");
5332   assert (Universe::heap() != NULL, "java heap should be initialized");
5333   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5334   int oop_index = oop_recorder()->find_index(obj);
5335   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5336   mov_narrow_oop(dst, oop_index, rspec);
5337 }
5338 
5339 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5340   assert (UseCompressedOops, "should only be used for compressed headers");
5341   assert (Universe::heap() != NULL, "java heap should be initialized");
5342   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5343   int oop_index = oop_recorder()->find_index(obj);
5344   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5345   mov_narrow_oop(dst, oop_index, rspec);
5346 }
5347 
5348 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5349   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5350   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5351   int klass_index = oop_recorder()->find_index(k);
5352   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5353   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5354 }
5355 
5356 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5357   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5358   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5359   int klass_index = oop_recorder()->find_index(k);
5360   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5361   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5362 }
5363 
5364 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5365   assert (UseCompressedOops, "should only be used for compressed headers");
5366   assert (Universe::heap() != NULL, "java heap should be initialized");
5367   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5368   int oop_index = oop_recorder()->find_index(obj);
5369   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5370   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5371 }
5372 
5373 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5374   assert (UseCompressedOops, "should only be used for compressed headers");
5375   assert (Universe::heap() != NULL, "java heap should be initialized");
5376   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5377   int oop_index = oop_recorder()->find_index(obj);
5378   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5379   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5380 }
5381 
5382 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5383   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5384   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5385   int klass_index = oop_recorder()->find_index(k);
5386   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5387   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5388 }
5389 
5390 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5391   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5392   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5393   int klass_index = oop_recorder()->find_index(k);
5394   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5395   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5396 }
5397 
5398 void MacroAssembler::reinit_heapbase() {
5399   if (UseCompressedOops) {
5400     if (Universe::heap() != NULL) {
5401       if (CompressedOops::base() == NULL) {
5402         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5403       } else {
5404         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5405       }
5406     } else {
5407       movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5408     }
5409   }
5410 }
5411 
5412 #endif // _LP64
5413 
5414 #if COMPILER2_OR_JVMCI
5415 
5416 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
5417 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5418   // cnt - number of qwords (8-byte words).
5419   // base - start address, qword aligned.
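       // Strategy: zero xtmp once, then clear 64 bytes (8 qwords) per main-loop iteration;
       // the tail handles the remaining 0-7 qwords with 32-byte, masked or 8-byte stores.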
5420   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5421   bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0);
5422   if (use64byteVector) {
5423     vpxor(xtmp, xtmp, xtmp, AVX_512bit);
5424   } else if (MaxVectorSize >= 32) {
5425     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5426   } else {
5427     pxor(xtmp, xtmp);
5428   }
5429   jmp(L_zero_64_bytes);
5430 
5431   BIND(L_loop);
5432   if (MaxVectorSize >= 32) {
5433     fill64(base, 0, xtmp, use64byteVector);
5434   } else {
5435     movdqu(Address(base,  0), xtmp);
5436     movdqu(Address(base, 16), xtmp);
5437     movdqu(Address(base, 32), xtmp);
5438     movdqu(Address(base, 48), xtmp);
5439   }
5440   addptr(base, 64);
5441 
5442   BIND(L_zero_64_bytes);
5443   subptr(cnt, 8);
5444   jccb(Assembler::greaterEqual, L_loop);
5445 
5446   // Copy trailing 64 bytes
5447   if (use64byteVector) {
5448     addptr(cnt, 8);
5449     jccb(Assembler::equal, L_end);
5450     fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
5451     jmp(L_end);
5452   } else {
5453     addptr(cnt, 4);
5454     jccb(Assembler::less, L_tail);
5455     if (MaxVectorSize >= 32) {
5456       vmovdqu(Address(base, 0), xtmp);
5457     } else {
5458       movdqu(Address(base,  0), xtmp);
5459       movdqu(Address(base, 16), xtmp);
5460     }
5461   }
5462   addptr(base, 32);
5463   subptr(cnt, 4);
5464 
5465   BIND(L_tail);
5466   addptr(cnt, 4);
5467   jccb(Assembler::lessEqual, L_end);
5468   if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
5469     fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
5470   } else {
5471     decrement(cnt);
5472 
5473     BIND(L_sloop);
5474     movq(Address(base, 0), xtmp);
5475     addptr(base, 8);
5476     decrement(cnt);
5477     jccb(Assembler::greaterEqual, L_sloop);
5478   }
5479   BIND(L_end);
5480 }
5481 
5482 // Clearing constant sized memory using YMM/ZMM registers.
5483 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5484   assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), "");
5485   bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
5486 
5487   int vector64_count = (cnt & (~0x7)) >> 3;
5488   cnt = cnt & 0x7;
5489   const int fill64_per_loop = 4;
5490   const int max_unrolled_fill64 = 8;
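       // Example: cnt == 19 qwords gives vector64_count == 2 (two full 64-byte stores,
       // fully unrolled since 2 <= max_unrolled_fill64) and 3 remaining qwords, cleared
       // by the masked-store tail below.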
5491 
5492   // 64 byte initialization loop.
5493   vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
5494   int start64 = 0;
5495   if (vector64_count > max_unrolled_fill64) {
5496     Label LOOP;
5497     Register index = rtmp;
5498 
5499     start64 = vector64_count - (vector64_count % fill64_per_loop);
5500 
5501     movl(index, 0);
5502     BIND(LOOP);
5503     for (int i = 0; i < fill64_per_loop; i++) {
5504       fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
5505     }
5506     addl(index, fill64_per_loop * 64);
5507     cmpl(index, start64 * 64);
5508     jccb(Assembler::less, LOOP);
5509   }
5510   for (int i = start64; i < vector64_count; i++) {
5511     fill64(base, i * 64, xtmp, use64byteVector);
5512   }
5513 
5514   // Clear remaining 64 byte tail.
5515   int disp = vector64_count * 64;
5516   if (cnt) {
5517     switch (cnt) {
5518       case 1:
5519         movq(Address(base, disp), xtmp);
5520         break;
5521       case 2:
5522         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit);
5523         break;
5524       case 3:
5525         movl(rtmp, 0x7);
5526         kmovwl(mask, rtmp);
5527         evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit);
5528         break;
5529       case 4:
5530         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5531         break;
5532       case 5:
5533         if (use64byteVector) {
5534           movl(rtmp, 0x1F);
5535           kmovwl(mask, rtmp);
5536           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5537         } else {
5538           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5539           movq(Address(base, disp + 32), xtmp);
5540         }
5541         break;
5542       case 6:
5543         if (use64byteVector) {
5544           movl(rtmp, 0x3F);
5545           kmovwl(mask, rtmp);
5546           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5547         } else {
5548           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5549           evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit);
5550         }
5551         break;
5552       case 7:
5553         if (use64byteVector) {
5554           movl(rtmp, 0x7F);
5555           kmovwl(mask, rtmp);
5556           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
5557         } else {
5558           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
5559           movl(rtmp, 0x7);
5560           kmovwl(mask, rtmp);
5561           evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit);
5562         }
5563         break;
5564       default:
5565         fatal("Unexpected length: %d", cnt);
5566         break;
5567     }
5568   }
5569 }
5570 
5571 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
5572                                bool is_large, KRegister mask) {
5573   // cnt      - number of qwords (8-byte words).
5574   // base     - start address, qword aligned.
5575   // is_large - if optimizers know cnt is larger than InitArrayShortSize
5576   assert(base==rdi, "base register must be edi for rep stos");
5577   assert(tmp==rax,   "tmp register must be eax for rep stos");
5578   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
5579   assert(InitArrayShortSize % BytesPerLong == 0,
5580     "InitArrayShortSize should be the multiple of BytesPerLong");
5581 
5582   Label DONE;
5583   if (!is_large || !UseXMMForObjInit) {
5584     xorptr(tmp, tmp);
5585   }
5586 
5587   if (!is_large) {
5588     Label LOOP, LONG;
5589     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5590     jccb(Assembler::greater, LONG);
5591 
5592     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5593 
5594     decrement(cnt);
5595     jccb(Assembler::negative, DONE); // Zero length
5596 
5597     // Use individual pointer-sized stores for small counts:
5598     BIND(LOOP);
5599     movptr(Address(base, cnt, Address::times_ptr), tmp);
5600     decrement(cnt);
5601     jccb(Assembler::greaterEqual, LOOP);
5602     jmpb(DONE);
5603 
5604     BIND(LONG);
5605   }
5606 
5607   // Use longer rep-prefixed ops for non-small counts:
5608   if (UseFastStosb) {
5609     shlptr(cnt, 3); // convert to number of bytes
5610     rep_stosb();
5611   } else if (UseXMMForObjInit) {
5612     xmm_clear_mem(base, cnt, tmp, xtmp, mask);
5613   } else {
5614     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5615     rep_stos();
5616   }
5617 
5618   BIND(DONE);
5619 }
5620 
5621 #endif //COMPILER2_OR_JVMCI
5622 
5623 
5624 void MacroAssembler::generate_fill(BasicType t, bool aligned,
5625                                    Register to, Register value, Register count,
5626                                    Register rtmp, XMMRegister xtmp) {
5627   ShortBranchVerifier sbv(this);
5628   assert_different_registers(to, value, count, rtmp);
5629   Label L_exit;
5630   Label L_fill_2_bytes, L_fill_4_bytes;
5631 
5632 #if defined(COMPILER2) && defined(_LP64)
5633   if (MaxVectorSize >= 32 &&
5634      VM_Version::supports_avx512vlbw() &&
5635      VM_Version::supports_bmi2()) {
5636     generate_fill_avx3(t, to, value, count, rtmp, xtmp);
5637     return;
5638   }
5639 #endif
5640 
5641   int shift = -1;
5642   switch (t) {
5643     case T_BYTE:
5644       shift = 2;
5645       break;
5646     case T_SHORT:
5647       shift = 1;
5648       break;
5649     case T_INT:
5650       shift = 0;
5651       break;
5652     default: ShouldNotReachHere();
5653   }
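       // count is in elements of type t; shift is chosen so that (8 << shift) elements
       // span 32 bytes and (2 << shift) elements span 8 bytes.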
5654 
5655   if (t == T_BYTE) {
5656     andl(value, 0xff);
5657     movl(rtmp, value);
5658     shll(rtmp, 8);
5659     orl(value, rtmp);
5660   }
5661   if (t == T_SHORT) {
5662     andl(value, 0xffff);
5663   }
5664   if (t == T_BYTE || t == T_SHORT) {
5665     movl(rtmp, value);
5666     shll(rtmp, 16);
5667     orl(value, rtmp);
5668   }
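       // 'value' now holds the fill element replicated across all 32 bits
       // (e.g. a byte fill of 0xAB becomes 0xABABABAB).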
5669 
5670   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
5671   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
5672   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
5673     Label L_skip_align2;
5674     // align source address at 4 bytes address boundary
5675     if (t == T_BYTE) {
5676       Label L_skip_align1;
5677       // One byte misalignment happens only for byte arrays
5678       testptr(to, 1);
5679       jccb(Assembler::zero, L_skip_align1);
5680       movb(Address(to, 0), value);
5681       increment(to);
5682       decrement(count);
5683       BIND(L_skip_align1);
5684     }
5685     // Two bytes misalignment happens only for byte and short (char) arrays
5686     testptr(to, 2);
5687     jccb(Assembler::zero, L_skip_align2);
5688     movw(Address(to, 0), value);
5689     addptr(to, 2);
5690     subl(count, 1<<(shift-1));
5691     BIND(L_skip_align2);
5692   }
5693   if (UseSSE < 2) {
5694     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5695     // Fill 32-byte chunks
5696     subl(count, 8 << shift);
5697     jcc(Assembler::less, L_check_fill_8_bytes);
5698     align(16);
5699 
5700     BIND(L_fill_32_bytes_loop);
5701 
5702     for (int i = 0; i < 32; i += 4) {
5703       movl(Address(to, i), value);
5704     }
5705 
5706     addptr(to, 32);
5707     subl(count, 8 << shift);
5708     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5709     BIND(L_check_fill_8_bytes);
5710     addl(count, 8 << shift);
5711     jccb(Assembler::zero, L_exit);
5712     jmpb(L_fill_8_bytes);
5713 
5714     //
5715     // length is too short, just fill qwords
5716     //
5717     BIND(L_fill_8_bytes_loop);
5718     movl(Address(to, 0), value);
5719     movl(Address(to, 4), value);
5720     addptr(to, 8);
5721     BIND(L_fill_8_bytes);
5722     subl(count, 1 << (shift + 1));
5723     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5724     // fall through to fill 4 bytes
5725   } else {
5726     Label L_fill_32_bytes;
5727     if (!UseUnalignedLoadStores) {
5728       // align to 8 bytes, we know we are 4 byte aligned to start
5729       testptr(to, 4);
5730       jccb(Assembler::zero, L_fill_32_bytes);
5731       movl(Address(to, 0), value);
5732       addptr(to, 4);
5733       subl(count, 1<<shift);
5734     }
5735     BIND(L_fill_32_bytes);
5736     {
5737       assert( UseSSE >= 2, "supported cpu only" );
5738       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5739       movdl(xtmp, value);
5740       if (UseAVX >= 2 && UseUnalignedLoadStores) {
5741         Label L_check_fill_32_bytes;
5742         if (UseAVX > 2) {
5743           // Fill 64-byte chunks
5744           Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
5745 
5746           // If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2
5747           cmpl(count, VM_Version::avx3_threshold());
5748           jccb(Assembler::below, L_check_fill_64_bytes_avx2);
5749 
5750           vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
5751 
5752           subl(count, 16 << shift);
5753           jccb(Assembler::less, L_check_fill_32_bytes);
5754           align(16);
5755 
5756           BIND(L_fill_64_bytes_loop_avx3);
5757           evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
5758           addptr(to, 64);
5759           subl(count, 16 << shift);
5760           jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
5761           jmpb(L_check_fill_32_bytes);
5762 
5763           BIND(L_check_fill_64_bytes_avx2);
5764         }
5765         // Fill 64-byte chunks
5766         Label L_fill_64_bytes_loop;
5767         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
5768 
5769         subl(count, 16 << shift);
5770         jcc(Assembler::less, L_check_fill_32_bytes);
5771         align(16);
5772 
5773         BIND(L_fill_64_bytes_loop);
5774         vmovdqu(Address(to, 0), xtmp);
5775         vmovdqu(Address(to, 32), xtmp);
5776         addptr(to, 64);
5777         subl(count, 16 << shift);
5778         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
5779 
5780         BIND(L_check_fill_32_bytes);
5781         addl(count, 8 << shift);
5782         jccb(Assembler::less, L_check_fill_8_bytes);
5783         vmovdqu(Address(to, 0), xtmp);
5784         addptr(to, 32);
5785         subl(count, 8 << shift);
5786 
5787         BIND(L_check_fill_8_bytes);
5788         // clean upper bits of YMM registers
5789         movdl(xtmp, value);
5790         pshufd(xtmp, xtmp, 0);
5791       } else {
5792         // Fill 32-byte chunks
5793         pshufd(xtmp, xtmp, 0);
5794 
5795         subl(count, 8 << shift);
5796         jcc(Assembler::less, L_check_fill_8_bytes);
5797         align(16);
5798 
5799         BIND(L_fill_32_bytes_loop);
5800 
5801         if (UseUnalignedLoadStores) {
5802           movdqu(Address(to, 0), xtmp);
5803           movdqu(Address(to, 16), xtmp);
5804         } else {
5805           movq(Address(to, 0), xtmp);
5806           movq(Address(to, 8), xtmp);
5807           movq(Address(to, 16), xtmp);
5808           movq(Address(to, 24), xtmp);
5809         }
5810 
5811         addptr(to, 32);
5812         subl(count, 8 << shift);
5813         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5814 
5815         BIND(L_check_fill_8_bytes);
5816       }
5817       addl(count, 8 << shift);
5818       jccb(Assembler::zero, L_exit);
5819       jmpb(L_fill_8_bytes);
5820 
5821       //
5822       // length is too short, just fill qwords
5823       //
5824       BIND(L_fill_8_bytes_loop);
5825       movq(Address(to, 0), xtmp);
5826       addptr(to, 8);
5827       BIND(L_fill_8_bytes);
5828       subl(count, 1 << (shift + 1));
5829       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5830     }
5831   }
5832   // fill trailing 4 bytes
5833   BIND(L_fill_4_bytes);
5834   testl(count, 1<<shift);
5835   jccb(Assembler::zero, L_fill_2_bytes);
5836   movl(Address(to, 0), value);
5837   if (t == T_BYTE || t == T_SHORT) {
5838     Label L_fill_byte;
5839     addptr(to, 4);
5840     BIND(L_fill_2_bytes);
5841     // fill trailing 2 bytes
5842     testl(count, 1<<(shift-1));
5843     jccb(Assembler::zero, L_fill_byte);
5844     movw(Address(to, 0), value);
5845     if (t == T_BYTE) {
5846       addptr(to, 2);
5847       BIND(L_fill_byte);
5848       // fill trailing byte
5849       testl(count, 1);
5850       jccb(Assembler::zero, L_exit);
5851       movb(Address(to, 0), value);
5852     } else {
5853       BIND(L_fill_byte);
5854     }
5855   } else {
5856     BIND(L_fill_2_bytes);
5857   }
5858   BIND(L_exit);
5859 }
5860 
5861 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
5862   switch(type) {
5863     case T_BYTE:
5864     case T_BOOLEAN:
5865       evpbroadcastb(dst, src, vector_len);
5866       break;
5867     case T_SHORT:
5868     case T_CHAR:
5869       evpbroadcastw(dst, src, vector_len);
5870       break;
5871     case T_INT:
5872     case T_FLOAT:
5873       evpbroadcastd(dst, src, vector_len);
5874       break;
5875     case T_LONG:
5876     case T_DOUBLE:
5877       evpbroadcastq(dst, src, vector_len);
5878       break;
5879     default:
5880       fatal("Unhandled type : %s", type2name(type));
5881       break;
5882   }
5883 }
5884 
5885 // encode char[] to byte[] in ISO_8859_1 or ASCII
5886    //@IntrinsicCandidate
5887    //private static int implEncodeISOArray(byte[] sa, int sp,
5888    //byte[] da, int dp, int len) {
5889    //  int i = 0;
5890    //  for (; i < len; i++) {
5891    //    char c = StringUTF16.getChar(sa, sp++);
5892    //    if (c > '\u00FF')
5893    //      break;
5894    //    da[dp++] = (byte)c;
5895    //  }
5896    //  return i;
5897    //}
5898    //
5899    //@IntrinsicCandidate
5900    //private static int implEncodeAsciiArray(char[] sa, int sp,
5901    //    byte[] da, int dp, int len) {
5902    //  int i = 0;
5903    //  for (; i < len; i++) {
5904    //    char c = sa[sp++];
5905    //    if (c >= '\u0080')
5906    //      break;
5907    //    da[dp++] = (byte)c;
5908    //  }
5909    //  return i;
5910    //}
5911 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
5912   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
5913   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
5914   Register tmp5, Register result, bool ascii) {
5915 
5916   // rsi: src
5917   // rdi: dst
5918   // rdx: len
5919   // rcx: tmp5
5920   // rax: result
5921   ShortBranchVerifier sbv(this);
5922   assert_different_registers(src, dst, len, tmp5, result);
5923   Label L_done, L_copy_1_char, L_copy_1_char_exit;
5924 
5925   int mask = ascii ? 0xff80ff80 : 0xff00ff00;
5926   int short_mask = ascii ? 0xff80 : 0xff00;
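       // The mask flags any char that cannot be encoded: bits above 0xFF for ISO-8859-1,
       // or any value >= 0x80 for ASCII; ptest/vptest against it detects an un-encodable
       // char anywhere in a vector of packed chars.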
5927 
5928   // set result
5929   xorl(result, result);
5930   // check for zero length
5931   testl(len, len);
5932   jcc(Assembler::zero, L_done);
5933 
5934   movl(result, len);
5935 
5936   // Setup pointers
5937   lea(src, Address(src, len, Address::times_2)); // char[]
5938   lea(dst, Address(dst, len, Address::times_1)); // byte[]
5939   negptr(len);
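       // src/dst now point just past the end of the data and len is a negative element
       // count, so the indexed addresses below walk forward as len counts up toward zero.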
5940 
5941   if (UseSSE42Intrinsics || UseAVX >= 2) {
5942     Label L_copy_8_chars, L_copy_8_chars_exit;
5943     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
5944 
5945     if (UseAVX >= 2) {
5946       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
5947       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
5948       movdl(tmp1Reg, tmp5);
5949       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
5950       jmp(L_chars_32_check);
5951 
5952       bind(L_copy_32_chars);
5953       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
5954       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
5955       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5956       vptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
5957       jccb(Assembler::notZero, L_copy_32_chars_exit);
5958       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5959       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
5960       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
5961 
5962       bind(L_chars_32_check);
5963       addptr(len, 32);
5964       jcc(Assembler::lessEqual, L_copy_32_chars);
5965 
5966       bind(L_copy_32_chars_exit);
5967       subptr(len, 16);
5968       jccb(Assembler::greater, L_copy_16_chars_exit);
5969 
5970     } else if (UseSSE42Intrinsics) {
5971       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
5972       movdl(tmp1Reg, tmp5);
5973       pshufd(tmp1Reg, tmp1Reg, 0);
5974       jmpb(L_chars_16_check);
5975     }
5976 
5977     bind(L_copy_16_chars);
5978     if (UseAVX >= 2) {
5979       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
5980       vptest(tmp2Reg, tmp1Reg);
5981       jcc(Assembler::notZero, L_copy_16_chars_exit);
5982       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
5983       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
5984     } else {
5985       if (UseAVX > 0) {
5986         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5987         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5988         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
5989       } else {
5990         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5991         por(tmp2Reg, tmp3Reg);
5992         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5993         por(tmp2Reg, tmp4Reg);
5994       }
5995       ptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
5996       jccb(Assembler::notZero, L_copy_16_chars_exit);
5997       packuswb(tmp3Reg, tmp4Reg);
5998     }
5999     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
6000 
6001     bind(L_chars_16_check);
6002     addptr(len, 16);
6003     jcc(Assembler::lessEqual, L_copy_16_chars);
6004 
6005     bind(L_copy_16_chars_exit);
6006     if (UseAVX >= 2) {
6007       // clean upper bits of YMM registers
6008       vpxor(tmp2Reg, tmp2Reg);
6009       vpxor(tmp3Reg, tmp3Reg);
6010       vpxor(tmp4Reg, tmp4Reg);
6011       movdl(tmp1Reg, tmp5);
6012       pshufd(tmp1Reg, tmp1Reg, 0);
6013     }
6014     subptr(len, 8);
6015     jccb(Assembler::greater, L_copy_8_chars_exit);
6016 
6017     bind(L_copy_8_chars);
6018     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
6019     ptest(tmp3Reg, tmp1Reg);
6020     jccb(Assembler::notZero, L_copy_8_chars_exit);
6021     packuswb(tmp3Reg, tmp1Reg);
6022     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
6023     addptr(len, 8);
6024     jccb(Assembler::lessEqual, L_copy_8_chars);
6025 
6026     bind(L_copy_8_chars_exit);
6027     subptr(len, 8);
6028     jccb(Assembler::zero, L_done);
6029   }
6030 
6031   bind(L_copy_1_char);
6032   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
6033   testl(tmp5, short_mask);      // check if Unicode or non-ASCII char
6034   jccb(Assembler::notZero, L_copy_1_char_exit);
6035   movb(Address(dst, len, Address::times_1, 0), tmp5);
6036   addptr(len, 1);
6037   jccb(Assembler::less, L_copy_1_char);
6038 
6039   bind(L_copy_1_char_exit);
6040   addptr(result, len); // len holds the negative count of unprocessed elements
6041 
6042   bind(L_done);
6043 }
6044 
6045 #ifdef _LP64
6046 /**
6047  * Helper for multiply_to_len().
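 *
 * Roughly equivalent scalar arithmetic (a sketch, not the emitted code):
 *   unsigned __int128 t = ((unsigned __int128)dest_hi << 64 | dest_lo) + src1 + src2;
 *   dest_lo = (uint64_t)t;  dest_hi = (uint64_t)(t >> 64);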
6048  */
6049 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
6050   addq(dest_lo, src1);
6051   adcq(dest_hi, 0);
6052   addq(dest_lo, src2);
6053   adcq(dest_hi, 0);
6054 }
6055 
6056 /**
6057  * Multiply 64 bit by 64 bit first loop.
6058  */
6059 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
6060                                            Register y, Register y_idx, Register z,
6061                                            Register carry, Register product,
6062                                            Register idx, Register kdx) {
6063   //
6064   //  jlong carry, x[], y[], z[];
6065   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
6066   //    huge_128 product = y[idx] * x[xstart] + carry;
6067   //    z[kdx] = (jlong)product;
6068   //    carry  = (jlong)(product >>> 64);
6069   //  }
6070   //  z[xstart] = carry;
6071   //
6072 
6073   Label L_first_loop, L_first_loop_exit;
6074   Label L_one_x, L_one_y, L_multiply;
6075 
6076   decrementl(xstart);
6077   jcc(Assembler::negative, L_one_x);
6078 
6079   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
6080   rorq(x_xstart, 32); // convert big-endian to little-endian
6081 
6082   bind(L_first_loop);
6083   decrementl(idx);
6084   jcc(Assembler::negative, L_first_loop_exit);
6085   decrementl(idx);
6086   jcc(Assembler::negative, L_one_y);
6087   movq(y_idx, Address(y, idx, Address::times_4,  0));
6088   rorq(y_idx, 32); // convert big-endian to little-endian
6089   bind(L_multiply);
6090   movq(product, x_xstart);
6091   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
6092   addq(product, carry);
6093   adcq(rdx, 0);
6094   subl(kdx, 2);
6095   movl(Address(z, kdx, Address::times_4,  4), product);
6096   shrq(product, 32);
6097   movl(Address(z, kdx, Address::times_4,  0), product);
6098   movq(carry, rdx);
6099   jmp(L_first_loop);
6100 
6101   bind(L_one_y);
6102   movl(y_idx, Address(y,  0));
6103   jmp(L_multiply);
6104 
6105   bind(L_one_x);
6106   movl(x_xstart, Address(x,  0));
6107   jmp(L_first_loop);
6108 
6109   bind(L_first_loop_exit);
6110 }
6111 
6112 /**
6113  * Multiply 64 bit by 64 bit and add 128 bit.
6114  */
6115 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
6116                                             Register yz_idx, Register idx,
6117                                             Register carry, Register product, int offset) {
6118   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
6119   //     z[kdx] = (jlong)product;
6120 
6121   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
6122   rorq(yz_idx, 32); // convert big-endian to little-endian
6123   movq(product, x_xstart);
6124   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
6125   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
6126   rorq(yz_idx, 32); // convert big-endian to little-endian
6127 
6128   add2_with_carry(rdx, product, carry, yz_idx);
6129 
6130   movl(Address(z, idx, Address::times_4,  offset+4), product);
6131   shrq(product, 32);
6132   movl(Address(z, idx, Address::times_4,  offset), product);
6133 
6134 }
6135 
6136 /**
6137  * Multiply 128 bit by 128 bit. Unrolled inner loop.
6138  */
6139 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
6140                                              Register yz_idx, Register idx, Register jdx,
6141                                              Register carry, Register product,
6142                                              Register carry2) {
6143   //   jlong carry, x[], y[], z[];
6144   //   int kdx = ystart+1;
6145   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6146   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
6147   //     z[kdx+idx+1] = (jlong)product;
6148   //     jlong carry2  = (jlong)(product >>> 64);
6149   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
6150   //     z[kdx+idx] = (jlong)product;
6151   //     carry  = (jlong)(product >>> 64);
6152   //   }
6153   //   idx += 2;
6154   //   if (idx > 0) {
6155   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
6156   //     z[kdx+idx] = (jlong)product;
6157   //     carry  = (jlong)(product >>> 64);
6158   //   }
6159   //
6160 
6161   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6162 
6163   movl(jdx, idx);
6164   andl(jdx, 0xFFFFFFFC);
6165   shrl(jdx, 2);
6166 
6167   bind(L_third_loop);
6168   subl(jdx, 1);
6169   jcc(Assembler::negative, L_third_loop_exit);
6170   subl(idx, 4);
6171 
6172   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
6173   movq(carry2, rdx);
6174 
6175   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
6176   movq(carry, rdx);
6177   jmp(L_third_loop);
6178 
6179   bind (L_third_loop_exit);
6180 
6181   andl (idx, 0x3);
6182   jcc(Assembler::zero, L_post_third_loop_done);
6183 
6184   Label L_check_1;
6185   subl(idx, 2);
6186   jcc(Assembler::negative, L_check_1);
6187 
6188   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
6189   movq(carry, rdx);
6190 
6191   bind (L_check_1);
6192   addl (idx, 0x2);
6193   andl (idx, 0x1);
6194   subl(idx, 1);
6195   jcc(Assembler::negative, L_post_third_loop_done);
6196 
6197   movl(yz_idx, Address(y, idx, Address::times_4,  0));
6198   movq(product, x_xstart);
6199   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
6200   movl(yz_idx, Address(z, idx, Address::times_4,  0));
6201 
6202   add2_with_carry(rdx, product, yz_idx, carry);
6203 
6204   movl(Address(z, idx, Address::times_4,  0), product);
6205   shrq(product, 32);
6206 
6207   shlq(rdx, 32);
6208   orq(product, rdx);
6209   movq(carry, product);
6210 
6211   bind(L_post_third_loop_done);
6212 }
6213 
6214 /**
6215  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
6216  *
6217  */
6218 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
6219                                                   Register carry, Register carry2,
6220                                                   Register idx, Register jdx,
6221                                                   Register yz_idx1, Register yz_idx2,
6222                                                   Register tmp, Register tmp3, Register tmp4) {
6223   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
6224 
6225   //   jlong carry, x[], y[], z[];
6226   //   int kdx = ystart+1;
6227   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6228   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
6229   //     jlong carry2  = (jlong)(tmp3 >>> 64);
6230   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
6231   //     carry  = (jlong)(tmp4 >>> 64);
6232   //     z[kdx+idx+1] = (jlong)tmp3;
6233   //     z[kdx+idx] = (jlong)tmp4;
6234   //   }
6235   //   idx += 2;
6236   //   if (idx > 0) {
6237   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
6238   //     z[kdx+idx] = (jlong)yz_idx1;
6239   //     carry  = (jlong)(yz_idx1 >>> 64);
6240   //   }
6241   //
6242 
6243   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6244 
6245   movl(jdx, idx);
6246   andl(jdx, 0xFFFFFFFC);
6247   shrl(jdx, 2);
6248 
6249   bind(L_third_loop);
6250   subl(jdx, 1);
6251   jcc(Assembler::negative, L_third_loop_exit);
6252   subl(idx, 4);
6253 
6254   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
6255   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
6256   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
6257   rorxq(yz_idx2, yz_idx2, 32);
6258 
6259   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
6260   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
6261 
6262   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
6263   rorxq(yz_idx1, yz_idx1, 32);
6264   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6265   rorxq(yz_idx2, yz_idx2, 32);
6266 
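  // adcx/adox maintain two independent carry chains (CF and OF respectively), so on
  // ADX-capable CPUs the two 128-bit accumulations below can be interleaved without
  // serializing on a single carry flag; otherwise fall back to add2_with_carry().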
6267   if (VM_Version::supports_adx()) {
6268     adcxq(tmp3, carry);
6269     adoxq(tmp3, yz_idx1);
6270 
6271     adcxq(tmp4, tmp);
6272     adoxq(tmp4, yz_idx2);
6273 
6274     movl(carry, 0); // does not affect flags
6275     adcxq(carry2, carry);
6276     adoxq(carry2, carry);
6277   } else {
6278     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
6279     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
6280   }
6281   movq(carry, carry2);
6282 
6283   movl(Address(z, idx, Address::times_4, 12), tmp3);
6284   shrq(tmp3, 32);
6285   movl(Address(z, idx, Address::times_4,  8), tmp3);
6286 
6287   movl(Address(z, idx, Address::times_4,  4), tmp4);
6288   shrq(tmp4, 32);
6289   movl(Address(z, idx, Address::times_4,  0), tmp4);
6290 
6291   jmp(L_third_loop);
6292 
6293   bind (L_third_loop_exit);
6294 
6295   andl (idx, 0x3);
6296   jcc(Assembler::zero, L_post_third_loop_done);
6297 
6298   Label L_check_1;
6299   subl(idx, 2);
6300   jcc(Assembler::negative, L_check_1);
6301 
6302   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
6303   rorxq(yz_idx1, yz_idx1, 32);
6304   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
6305   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6306   rorxq(yz_idx2, yz_idx2, 32);
6307 
6308   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
6309 
6310   movl(Address(z, idx, Address::times_4,  4), tmp3);
6311   shrq(tmp3, 32);
6312   movl(Address(z, idx, Address::times_4,  0), tmp3);
6313   movq(carry, tmp4);
6314 
6315   bind (L_check_1);
6316   addl (idx, 0x2);
6317   andl (idx, 0x1);
6318   subl(idx, 1);
6319   jcc(Assembler::negative, L_post_third_loop_done);
6320   movl(tmp4, Address(y, idx, Address::times_4,  0));
6321   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
6322   movl(tmp4, Address(z, idx, Address::times_4,  0));
6323 
6324   add2_with_carry(carry2, tmp3, tmp4, carry);
6325 
6326   movl(Address(z, idx, Address::times_4,  0), tmp3);
6327   shrq(tmp3, 32);
6328 
6329   shlq(carry2, 32);
6330   orq(tmp3, carry2);
6331   movq(carry, tmp3);
6332 
6333   bind(L_post_third_loop_done);
6334 }
6335 
6336 /**
6337  * Code for BigInteger::multiplyToLen() intrinsic.
6338  *
6339  * rdi: x
6340  * rax: xlen
6341  * rsi: y
6342  * rcx: ylen
6343  * r8:  z
6344  * r11: zlen
6345  * r12: tmp1
6346  * r13: tmp2
6347  * r14: tmp3
6348  * r15: tmp4
6349  * rbx: tmp5
6350  *
6351  */
6352 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
6353                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
6354   ShortBranchVerifier sbv(this);
6355   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
6356 
6357   push(tmp1);
6358   push(tmp2);
6359   push(tmp3);
6360   push(tmp4);
6361   push(tmp5);
6362 
6363   push(xlen);
6364   push(zlen);
6365 
6366   const Register idx = tmp1;
6367   const Register kdx = tmp2;
6368   const Register xstart = tmp3;
6369 
6370   const Register y_idx = tmp4;
6371   const Register carry = tmp5;
6372   const Register product  = xlen;
6373   const Register x_xstart = zlen;  // reuse register
6374 
6375   // First Loop.
6376   //
6377   //  final static long LONG_MASK = 0xffffffffL;
6378   //  int xstart = xlen - 1;
6379   //  int ystart = ylen - 1;
6380   //  long carry = 0;
6381   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
6382   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
6383   //    z[kdx] = (int)product;
6384   //    carry = product >>> 32;
6385   //  }
6386   //  z[xstart] = (int)carry;
6387   //
6388 
6389   movl(idx, ylen);      // idx = ylen;
6390   movl(kdx, zlen);      // kdx = xlen+ylen;
6391   xorq(carry, carry);   // carry = 0;
6392 
6393   Label L_done;
6394 
6395   movl(xstart, xlen);
6396   decrementl(xstart);
6397   jcc(Assembler::negative, L_done);
6398 
6399   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
6400 
6401   Label L_second_loop;
6402   testl(kdx, kdx);
6403   jcc(Assembler::zero, L_second_loop);
6404 
6405   Label L_carry;
6406   subl(kdx, 1);
6407   jcc(Assembler::zero, L_carry);
6408 
6409   movl(Address(z, kdx, Address::times_4,  0), carry);
6410   shrq(carry, 32);
6411   subl(kdx, 1);
6412 
6413   bind(L_carry);
6414   movl(Address(z, kdx, Address::times_4,  0), carry);
6415 
6416   // Second and third (nested) loops.
6417   //
6418   // for (int i = xstart-1; i >= 0; i--) { // Second loop
6419   //   carry = 0;
6420   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
6421   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
6422   //                    (z[k] & LONG_MASK) + carry;
6423   //     z[k] = (int)product;
6424   //     carry = product >>> 32;
6425   //   }
6426   //   z[i] = (int)carry;
6427   // }
6428   //
6429   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
6430 
6431   const Register jdx = tmp1;
6432 
6433   bind(L_second_loop);
6434   xorl(carry, carry);    // carry = 0;
6435   movl(jdx, ylen);       // j = ystart+1
6436 
6437   subl(xstart, 1);       // i = xstart-1;
6438   jcc(Assembler::negative, L_done);
6439 
6440   push (z);
6441 
6442   Label L_last_x;
6443   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
6444   subl(xstart, 1);       // i = xstart-1;
6445   jcc(Assembler::negative, L_last_x);
6446 
6447   if (UseBMI2Instructions) {
6448     movq(rdx,  Address(x, xstart, Address::times_4,  0));
6449     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
6450   } else {
6451     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
6452     rorq(x_xstart, 32);  // convert big-endian to little-endian
6453   }
6454 
6455   Label L_third_loop_prologue;
6456   bind(L_third_loop_prologue);
6457 
6458   push (x);
6459   push (xstart);
6460   push (ylen);
6461 
6462 
6463   if (UseBMI2Instructions) {
6464     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
6465   } else { // !UseBMI2Instructions
6466     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
6467   }
6468 
6469   pop(ylen);
6470   pop(xlen);
6471   pop(x);
6472   pop(z);
6473 
6474   movl(tmp3, xlen);
6475   addl(tmp3, 1);
6476   movl(Address(z, tmp3, Address::times_4,  0), carry);
6477   subl(tmp3, 1);
6478   jccb(Assembler::negative, L_done);
6479 
6480   shrq(carry, 32);
6481   movl(Address(z, tmp3, Address::times_4,  0), carry);
6482   jmp(L_second_loop);
6483 
6484   // Next infrequent code is moved outside loops.
6485   bind(L_last_x);
6486   if (UseBMI2Instructions) {
6487     movl(rdx, Address(x,  0));
6488   } else {
6489     movl(x_xstart, Address(x,  0));
6490   }
6491   jmp(L_third_loop_prologue);
6492 
6493   bind(L_done);
6494 
6495   pop(zlen);
6496   pop(xlen);
6497 
6498   pop(tmp5);
6499   pop(tmp4);
6500   pop(tmp3);
6501   pop(tmp2);
6502   pop(tmp1);
6503 }
6504 
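/**
 * Used by the ArraysSupport.vectorizedMismatch intrinsic. A sketch of the contract as
 * implemented here: compare 'length' elements of obja and objb, where log2_array_indxscale
 * (passed in rcx) is the log2 of the element size in bytes. On return, result holds the
 * element index of the first difference, or -1 if no difference was found in the range.
 */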
6505 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
6506   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
6507   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
6508   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
6509   Label VECTOR8_TAIL, VECTOR4_TAIL;
6510   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
6511   Label SAME_TILL_END, DONE;
6512   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
6513 
6514   //scale is in rcx in both Win64 and Unix
6515   ShortBranchVerifier sbv(this);
6516 
6517   shlq(length);
6518   xorq(result, result);
6519 
6520   if ((AVX3Threshold == 0) && (UseAVX > 2) &&
6521       VM_Version::supports_avx512vlbw()) {
6522     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
6523 
6524     cmpq(length, 64);
6525     jcc(Assembler::less, VECTOR32_TAIL);
6526 
6527     movq(tmp1, length);
6528     andq(tmp1, 0x3F);      // tail count
6529     andq(length, ~(0x3F)); //vector count
6530 
6531     bind(VECTOR64_LOOP);
6532     // AVX512 code to compare 64 byte vectors.
6533     evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
6534     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
6535     kortestql(k7, k7);
6536     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
6537     addq(result, 64);
6538     subq(length, 64);
6539     jccb(Assembler::notZero, VECTOR64_LOOP);
6540 
6541     //bind(VECTOR64_TAIL);
6542     testq(tmp1, tmp1);
6543     jcc(Assembler::zero, SAME_TILL_END);
6544 
6546     // AVX512 code to compare up to 63 byte vectors.
6547     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
6548     shlxq(tmp2, tmp2, tmp1);
6549     notq(tmp2);
6550     kmovql(k3, tmp2);
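    // k3 now has only the low tmp1 (tail count) bits set, so the masked load and compare
    // below examine just the remaining tail bytes.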
6551 
6552     evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
6553     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
6554 
6555     ktestql(k7, k3);
6556     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
6557 
6558     bind(VECTOR64_NOT_EQUAL);
6559     kmovql(tmp1, k7);
6560     notq(tmp1);
6561     tzcntq(tmp1, tmp1);
6562     addq(result, tmp1);
6563     shrq(result);
6564     jmp(DONE);
6565     bind(VECTOR32_TAIL);
6566   }
6567 
6568   cmpq(length, 8);
6569   jcc(Assembler::equal, VECTOR8_LOOP);
6570   jcc(Assembler::less, VECTOR4_TAIL);
6571 
6572   if (UseAVX >= 2) {
6573     Label VECTOR16_TAIL, VECTOR32_LOOP;
6574 
6575     cmpq(length, 16);
6576     jcc(Assembler::equal, VECTOR16_LOOP);
6577     jcc(Assembler::less, VECTOR8_LOOP);
6578 
6579     cmpq(length, 32);
6580     jccb(Assembler::less, VECTOR16_TAIL);
6581 
6582     subq(length, 32);
6583     bind(VECTOR32_LOOP);
6584     vmovdqu(rymm0, Address(obja, result));
6585     vmovdqu(rymm1, Address(objb, result));
6586     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
6587     vptest(rymm2, rymm2);
6588     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
6589     addq(result, 32);
6590     subq(length, 32);
6591     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
6592     addq(length, 32);
6593     jcc(Assembler::equal, SAME_TILL_END);
6594     // falling through if less than 32 bytes left; close the branch here.
6595 
6596     bind(VECTOR16_TAIL);
6597     cmpq(length, 16);
6598     jccb(Assembler::less, VECTOR8_TAIL);
6599     bind(VECTOR16_LOOP);
6600     movdqu(rymm0, Address(obja, result));
6601     movdqu(rymm1, Address(objb, result));
6602     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
6603     ptest(rymm2, rymm2);
6604     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6605     addq(result, 16);
6606     subq(length, 16);
6607     jcc(Assembler::equal, SAME_TILL_END);
6608     //falling through if less than 16 bytes left
6609   } else {//regular intrinsics
6610 
6611     cmpq(length, 16);
6612     jccb(Assembler::less, VECTOR8_TAIL);
6613 
6614     subq(length, 16);
6615     bind(VECTOR16_LOOP);
6616     movdqu(rymm0, Address(obja, result));
6617     movdqu(rymm1, Address(objb, result));
6618     pxor(rymm0, rymm1);
6619     ptest(rymm0, rymm0);
6620     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6621     addq(result, 16);
6622     subq(length, 16);
6623     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
6624     addq(length, 16);
6625     jcc(Assembler::equal, SAME_TILL_END);
6626     //falling through if less than 16 bytes left
6627   }
6628 
6629   bind(VECTOR8_TAIL);
6630   cmpq(length, 8);
6631   jccb(Assembler::less, VECTOR4_TAIL);
6632   bind(VECTOR8_LOOP);
6633   movq(tmp1, Address(obja, result));
6634   movq(tmp2, Address(objb, result));
6635   xorq(tmp1, tmp2);
6636   testq(tmp1, tmp1);
6637   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
6638   addq(result, 8);
6639   subq(length, 8);
6640   jcc(Assembler::equal, SAME_TILL_END);
6641   //falling through if less than 8 bytes left
6642 
6643   bind(VECTOR4_TAIL);
6644   cmpq(length, 4);
6645   jccb(Assembler::less, BYTES_TAIL);
6646   bind(VECTOR4_LOOP);
6647   movl(tmp1, Address(obja, result));
6648   xorl(tmp1, Address(objb, result));
6649   testl(tmp1, tmp1);
6650   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
6651   addq(result, 4);
6652   subq(length, 4);
6653   jcc(Assembler::equal, SAME_TILL_END);
6654   //falling through if less than 4 bytes left
6655 
6656   bind(BYTES_TAIL);
6657   bind(BYTES_LOOP);
6658   load_unsigned_byte(tmp1, Address(obja, result));
6659   load_unsigned_byte(tmp2, Address(objb, result));
6660   xorl(tmp1, tmp2);
6661   testl(tmp1, tmp1);
6662   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6663   decq(length);
6664   jcc(Assembler::zero, SAME_TILL_END);
6665   incq(result);
6666   load_unsigned_byte(tmp1, Address(obja, result));
6667   load_unsigned_byte(tmp2, Address(objb, result));
6668   xorl(tmp1, tmp2);
6669   testl(tmp1, tmp1);
6670   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6671   decq(length);
6672   jcc(Assembler::zero, SAME_TILL_END);
6673   incq(result);
6674   load_unsigned_byte(tmp1, Address(obja, result));
6675   load_unsigned_byte(tmp2, Address(objb, result));
6676   xorl(tmp1, tmp2);
6677   testl(tmp1, tmp1);
6678   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6679   jmp(SAME_TILL_END);
6680 
6681   if (UseAVX >= 2) {
6682     bind(VECTOR32_NOT_EQUAL);
6683     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
6684     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
6685     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
6686     vpmovmskb(tmp1, rymm0);
6687     bsfq(tmp1, tmp1);
6688     addq(result, tmp1);
6689     shrq(result);
6690     jmp(DONE);
6691   }
6692 
6693   bind(VECTOR16_NOT_EQUAL);
6694   if (UseAVX >= 2) {
6695     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
6696     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
6697     pxor(rymm0, rymm2);
6698   } else {
6699     pcmpeqb(rymm2, rymm2);
6700     pxor(rymm0, rymm1);
6701     pcmpeqb(rymm0, rymm1);
6702     pxor(rymm0, rymm2);
6703   }
6704   pmovmskb(tmp1, rymm0);
6705   bsfq(tmp1, tmp1);
6706   addq(result, tmp1);
6707   shrq(result);
6708   jmpb(DONE);
6709 
6710   bind(VECTOR8_NOT_EQUAL);
6711   bind(VECTOR4_NOT_EQUAL);
6712   bsfq(tmp1, tmp1);
6713   shrq(tmp1, 3);
6714   addq(result, tmp1);
6715   bind(BYTES_NOT_EQUAL);
6716   shrq(result);
6717   jmpb(DONE);
6718 
6719   bind(SAME_TILL_END);
6720   mov64(result, -1);
6721 
6722   bind(DONE);
6723 }
6724 
6725 //Helper functions for square_to_len()
6726 
6727 /**
6728  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
6729  * Preserves x and z and modifies rest of the registers.
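 *
 * Storing the squares pre-shifted right by one pairs with the final lshift_by_1() in
 * square_to_len(): that closing left shift doubles the off-diagonal products added in
 * between while restoring the squares, and the low bit is patched back in from x[len-1].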
6730  */
6731 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6732   // Perform square and right shift by 1
6733   // Handle odd xlen case first, then for even xlen do the following
6734   // jlong carry = 0;
6735   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
6736   //     huge_128 product = x[j:j+1] * x[j:j+1];
6737   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
6738   //     z[i+2:i+3] = (jlong)(product >>> 1);
6739   //     carry = (jlong)product;
6740   // }
6741 
6742   xorq(tmp5, tmp5);     // carry
6743   xorq(rdxReg, rdxReg);
6744   xorl(tmp1, tmp1);     // index for x
6745   xorl(tmp4, tmp4);     // index for z
6746 
6747   Label L_first_loop, L_first_loop_exit;
6748 
6749   testl(xlen, 1);
6750   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
6751 
6752   // Square and right shift by 1 the odd element using 32 bit multiply
6753   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
6754   imulq(raxReg, raxReg);
6755   shrq(raxReg, 1);
6756   adcq(tmp5, 0);
6757   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
6758   incrementl(tmp1);
6759   addl(tmp4, 2);
6760 
6761   // Square and  right shift by 1 the rest using 64 bit multiply
6762   bind(L_first_loop);
6763   cmpptr(tmp1, xlen);
6764   jccb(Assembler::equal, L_first_loop_exit);
6765 
6766   // Square
6767   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
6768   rorq(raxReg, 32);    // convert big-endian to little-endian
6769   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
6770 
6771   // Right shift by 1 and save carry
6772   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
6773   rcrq(rdxReg, 1);
6774   rcrq(raxReg, 1);
6775   adcq(tmp5, 0);
6776 
6777   // Store result in z
6778   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
6779   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
6780 
6781   // Update indices for x and z
6782   addl(tmp1, 2);
6783   addl(tmp4, 4);
6784   jmp(L_first_loop);
6785 
6786   bind(L_first_loop_exit);
6787 }
6788 
6789 
6790 /**
6791  * Perform the following multiply add operation using BMI2 instructions
6792  * carry:sum = sum + op1*op2 + carry
6793  * op2 should be in rdx
6794  * op2 is preserved, all other registers are modified
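 *
 * Roughly equivalent scalar arithmetic (a sketch; multiply_add_64 below implements the
 * same contract without BMI2):
 *   unsigned __int128 t = (unsigned __int128)op1 * op2 + sum + carry;
 *   sum = (uint64_t)t;  carry = (uint64_t)(t >> 64);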
6795  */
6796 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
6797   // assert op2 is rdx
6798   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
6799   addq(sum, carry);
6800   adcq(tmp2, 0);
6801   addq(sum, op1);
6802   adcq(tmp2, 0);
6803   movq(carry, tmp2);
6804 }
6805 
6806 /**
6807  * Perform the following multiply add operation:
6808  * carry:sum = sum + op1*op2 + carry
6809  * Preserves op1, op2 and modifies rest of registers
6810  */
6811 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
6812   // rdx:rax = op1 * op2
6813   movq(raxReg, op2);
6814   mulq(op1);
6815 
6816   //  rdx:rax = sum + carry + rdx:rax
6817   addq(sum, carry);
6818   adcq(rdxReg, 0);
6819   addq(sum, raxReg);
6820   adcq(rdxReg, 0);
6821 
6822   // carry:sum = rdx:sum
6823   movq(carry, rdxReg);
6824 }
6825 
6826 /**
6827  * Add 64 bit long carry into z[] with carry propagation.
6828  * Preserves z and carry register values and modifies rest of registers.
6829  *
6830  */
6831 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
6832   Label L_fourth_loop, L_fourth_loop_exit;
6833 
6834   movl(tmp1, 1);
6835   subl(zlen, 2);
6836   addq(Address(z, zlen, Address::times_4, 0), carry);
6837 
6838   bind(L_fourth_loop);
6839   jccb(Assembler::carryClear, L_fourth_loop_exit);
6840   subl(zlen, 2);
6841   jccb(Assembler::negative, L_fourth_loop_exit);
6842   addq(Address(z, zlen, Address::times_4, 0), tmp1);
6843   jmp(L_fourth_loop);
6844   bind(L_fourth_loop_exit);
6845 }
6846 
6847 /**
6848  * Shift z[] left by 1 bit.
6849  * Preserves x, len, z and zlen registers and modifies rest of the registers.
6850  *
6851  */
6852 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
6853 
6854   Label L_fifth_loop, L_fifth_loop_exit;
6855 
6856   // Fifth loop
6857   // Perform primitiveLeftShift(z, zlen, 1)
6858 
6859   const Register prev_carry = tmp1;
6860   const Register new_carry = tmp4;
6861   const Register value = tmp2;
6862   const Register zidx = tmp3;
6863 
6864   // int zidx, carry;
6865   // long value;
6866   // carry = 0;
6867   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
6868   //    (carry:value)  = (z[zidx] << 1) | carry;
6869   //    z[zidx] = value;
6870   // }
6871 
6872   movl(zidx, zlen);
6873   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
6874 
6875   bind(L_fifth_loop);
6876   decl(zidx);  // Use decl to preserve carry flag
6877   decl(zidx);
6878   jccb(Assembler::negative, L_fifth_loop_exit);
6879 
6880   if (UseBMI2Instructions) {
6881      movq(value, Address(z, zidx, Address::times_4, 0));
6882      rclq(value, 1);
6883      rorxq(value, value, 32);
6884      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6885   }
6886   else {
6887     // clear new_carry
6888     xorl(new_carry, new_carry);
6889 
6890     // Shift z[i] by 1, or in previous carry and save new carry
6891     movq(value, Address(z, zidx, Address::times_4, 0));
6892     shlq(value, 1);
6893     adcl(new_carry, 0);
6894 
6895     orq(value, prev_carry);
6896     rorq(value, 0x20);
6897     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6898 
6899     // Set previous carry = new carry
6900     movl(prev_carry, new_carry);
6901   }
6902   jmp(L_fifth_loop);
6903 
6904   bind(L_fifth_loop_exit);
6905 }
6906 
6907 
6908 /**
6909  * Code for BigInteger::squareToLen() intrinsic
6910  *
6911  * rdi: x
6912  * rsi: len
6913  * r8:  z
6914  * rcx: zlen
6915  * r12: tmp1
6916  * r13: tmp2
6917  * r14: tmp3
6918  * r15: tmp4
6919  * rbx: tmp5
6920  *
6921  */
6922 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6923 
6924   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
6925   push(tmp1);
6926   push(tmp2);
6927   push(tmp3);
6928   push(tmp4);
6929   push(tmp5);
6930 
6931   // First loop
6932   // Store the squares, right shifted one bit (i.e., divided by 2).
6933   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
6934 
6935   // Add in off-diagonal sums.
6936   //
6937   // Second, third (nested) and fourth loops.
6938   // zlen +=2;
6939   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
6940   //    carry = 0;
6941   //    long op2 = x[xidx:xidx+1];
6942   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
6943   //       k -= 2;
6944   //       long op1 = x[j:j+1];
6945   //       long sum = z[k:k+1];
6946   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
6947   //       z[k:k+1] = sum;
6948   //    }
6949   //    add_one_64(z, k, carry, tmp_regs);
6950   // }
6951 
6952   const Register carry = tmp5;
6953   const Register sum = tmp3;
6954   const Register op1 = tmp4;
6955   Register op2 = tmp2;
6956 
6957   push(zlen);
6958   push(len);
6959   addl(zlen,2);
6960   bind(L_second_loop);
6961   xorq(carry, carry);
6962   subl(zlen, 4);
6963   subl(len, 2);
6964   push(zlen);
6965   push(len);
6966   cmpl(len, 0);
6967   jccb(Assembler::lessEqual, L_second_loop_exit);
6968 
6969   // Multiply an array by one 64 bit long.
6970   if (UseBMI2Instructions) {
6971     op2 = rdxReg;
6972     movq(op2, Address(x, len, Address::times_4,  0));
6973     rorxq(op2, op2, 32);
6974   }
6975   else {
6976     movq(op2, Address(x, len, Address::times_4,  0));
6977     rorq(op2, 32);
6978   }
6979 
6980   bind(L_third_loop);
6981   decrementl(len);
6982   jccb(Assembler::negative, L_third_loop_exit);
6983   decrementl(len);
6984   jccb(Assembler::negative, L_last_x);
6985 
6986   movq(op1, Address(x, len, Address::times_4,  0));
6987   rorq(op1, 32);
6988 
6989   bind(L_multiply);
6990   subl(zlen, 2);
6991   movq(sum, Address(z, zlen, Address::times_4,  0));
6992 
6993   // Multiply 64 bit by 64 bit and add: the lower 64 bits of the result go into sum, the upper 64 bits become the carry.
6994   if (UseBMI2Instructions) {
6995     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
6996   }
6997   else {
6998     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6999   }
7000 
7001   movq(Address(z, zlen, Address::times_4, 0), sum);
7002 
7003   jmp(L_third_loop);
7004   bind(L_third_loop_exit);
7005 
7006   // Fourth loop
7007   // Add 64 bit long carry into z with carry propagation.
7008   // Uses the already-adjusted zlen.
7009   add_one_64(z, zlen, carry, tmp1);
7010 
7011   pop(len);
7012   pop(zlen);
7013   jmp(L_second_loop);
7014 
7015   // Next infrequent code is moved outside loops.
7016   bind(L_last_x);
7017   movl(op1, Address(x, 0));
7018   jmp(L_multiply);
7019 
7020   bind(L_second_loop_exit);
7021   pop(len);
7022   pop(zlen);
7023   pop(len);
7024   pop(zlen);
7025 
7026   // Fifth loop
7027   // Shift z left 1 bit.
7028   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
7029 
7030   // z[zlen-1] |= x[len-1] & 1;
7031   movl(tmp3, Address(x, len, Address::times_4, -4));
7032   andl(tmp3, 1);
7033   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
7034 
7035   pop(tmp5);
7036   pop(tmp4);
7037   pop(tmp3);
7038   pop(tmp2);
7039   pop(tmp1);
7040 }
7041 
7042 /**
7043  * Helper function for mul_add().
7044  * Multiplies in[] by the int k and adds the result to out[] starting at offset offs,
7045  * using 128 bit by 32 bit multiplies; the carry is returned in tmp5.
7046  * Only the quad-int-aligned portion of in[] is processed by this function.
7047  * k is in rdxReg when UseBMI2Instructions is set, otherwise in tmp2.
7048  * This function preserves the out, in and k registers.
7049  * len and offset point to the appropriate index in "in" and "out" respectively.
7050  * tmp5 holds the carry.
7051  * The other registers are temporaries and are modified.
7052  *
7053  */
7054 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
7055   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
7056   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7057 
7058   Label L_first_loop, L_first_loop_exit;
7059 
7060   movl(tmp1, len);
7061   shrl(tmp1, 2);
7062 
7063   bind(L_first_loop);
7064   subl(tmp1, 1);
7065   jccb(Assembler::negative, L_first_loop_exit);
7066 
7067   subl(len, 4);
7068   subl(offset, 4);
7069 
7070   Register op2 = tmp2;
7071   const Register sum = tmp3;
7072   const Register op1 = tmp4;
7073   const Register carry = tmp5;
7074 
7075   if (UseBMI2Instructions) {
7076     op2 = rdxReg;
7077   }
7078 
7079   movq(op1, Address(in, len, Address::times_4,  8));
7080   rorq(op1, 32);
7081   movq(sum, Address(out, offset, Address::times_4,  8));
7082   rorq(sum, 32);
7083   if (UseBMI2Instructions) {
7084     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7085   }
7086   else {
7087     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7088   }
7089   // Store back in big endian from little endian
7090   rorq(sum, 0x20);
7091   movq(Address(out, offset, Address::times_4,  8), sum);
7092 
7093   movq(op1, Address(in, len, Address::times_4,  0));
7094   rorq(op1, 32);
7095   movq(sum, Address(out, offset, Address::times_4,  0));
7096   rorq(sum, 32);
7097   if (UseBMI2Instructions) {
7098     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7099   }
7100   else {
7101     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7102   }
7103   // Store back in big endian from little endian
7104   rorq(sum, 0x20);
7105   movq(Address(out, offset, Address::times_4,  0), sum);
7106 
7107   jmp(L_first_loop);
7108   bind(L_first_loop_exit);
7109 }
7110 
7111 /**
7112  * Code for BigInteger::mulAdd() intrinsic
7113  *
7114  * rdi: out
7115  * rsi: in
7116  * r11: offs (out.length - offset)
7117  * rcx: len
7118  * r8:  k
7119  * r12: tmp1
7120  * r13: tmp2
7121  * r14: tmp3
7122  * r15: tmp4
7123  * rbx: tmp5
7124  * Multiplies in[] by the word k, adds the result to out[], and returns the carry in rax.
7125  */
7126 void MacroAssembler::mul_add(Register out, Register in, Register offs,
7127    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
7128    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7129 
7130   Label L_carry, L_last_in, L_done;
7131 
7132 // carry = 0;
7133 // for (int j=len-1; j >= 0; j--) {
7134 //    long product = (in[j] & LONG_MASK) * kLong +
7135 //                   (out[offs] & LONG_MASK) + carry;
7136 //    out[offs--] = (int)product;
7137 //    carry = product >>> 32;
7138 // }
7139 //
7140   push(tmp1);
7141   push(tmp2);
7142   push(tmp3);
7143   push(tmp4);
7144   push(tmp5);
7145 
7146   Register op2 = tmp2;
7147   const Register sum = tmp3;
7148   const Register op1 = tmp4;
7149   const Register carry =  tmp5;
7150 
7151   if (UseBMI2Instructions) {
7152     op2 = rdxReg;
7153   }
7154   movl(op2, k);
7158 
7159   xorq(carry, carry);
7160 
7161   //First loop
7162 
7163   //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
7164   //The carry is in tmp5
7165   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
7166 
7167   //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
7168   decrementl(len);
7169   jccb(Assembler::negative, L_carry);
7170   decrementl(len);
7171   jccb(Assembler::negative, L_last_in);
7172 
7173   movq(op1, Address(in, len, Address::times_4,  0));
7174   rorq(op1, 32);
7175 
7176   subl(offs, 2);
7177   movq(sum, Address(out, offs, Address::times_4,  0));
7178   rorq(sum, 32);
7179 
7180   if (UseBMI2Instructions) {
7181     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7182   }
7183   else {
7184     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7185   }
7186 
7187   // Store back in big endian from little endian
7188   rorq(sum, 0x20);
7189   movq(Address(out, offs, Address::times_4,  0), sum);
7190 
7191   testl(len, len);
7192   jccb(Assembler::zero, L_carry);
7193 
7194   //Multiply the last in[] entry, if any
7195   bind(L_last_in);
7196   movl(op1, Address(in, 0));
7197   movl(sum, Address(out, offs, Address::times_4,  -4));
7198 
7199   movl(raxReg, k);
7200   mull(op1); //tmp4 * eax -> edx:eax
7201   addl(sum, carry);
7202   adcl(rdxReg, 0);
7203   addl(sum, raxReg);
7204   adcl(rdxReg, 0);
7205   movl(carry, rdxReg);
7206 
7207   movl(Address(out, offs, Address::times_4,  -4), sum);
7208 
7209   bind(L_carry);
7210   //return tmp5/carry as carry in rax
7211   movl(rax, carry);
7212 
7213   bind(L_done);
7214   pop(tmp5);
7215   pop(tmp4);
7216   pop(tmp3);
7217   pop(tmp2);
7218   pop(tmp1);
7219 }
7220 #endif
7221 
7222 /**
7223  * Emits code to update CRC-32 with a byte value according to constants in table
7224  *
7225  * @param [in,out]crc   Register containing the crc.
7226  * @param [in]val       Register containing the byte to fold into the CRC.
7227  * @param [in]table     Register containing the table of crc constants.
7228  *
7229  * uint32_t crc;
7230  * val = crc_table[(val ^ crc) & 0xFF];
7231  * crc = val ^ (crc >> 8);
7232  *
7233  */
7234 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7235   xorl(val, crc);
7236   andl(val, 0xFF);
7237   shrl(crc, 8); // unsigned shift
7238   xorl(crc, Address(table, val, Address::times_4, 0));
7239 }
7240 
7241 /**
7242  * Fold 128-bit data chunk
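 *
 * In carry-less (GF(2)) arithmetic this computes, roughly,
 *   xcrc = clmul(xcrc[127:64], xK[127:64]) ^ clmul(xcrc[63:0], xK[63:0]) ^ mem[buf + offset]
 * i.e. the 128-bit running remainder is folded forward over the next 16 input bytes.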
7243  */
7244 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7245   if (UseAVX > 0) {
7246     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7247     vpclmulldq(xcrc, xK, xcrc); // [63:0]
7248     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7249     pxor(xcrc, xtmp);
7250   } else {
7251     movdqa(xtmp, xcrc);
7252     pclmulhdq(xtmp, xK);   // [123:64]
7253     pclmulldq(xcrc, xK);   // [63:0]
7254     pxor(xcrc, xtmp);
7255     movdqu(xtmp, Address(buf, offset));
7256     pxor(xcrc, xtmp);
7257   }
7258 }
7259 
7260 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7261   if (UseAVX > 0) {
7262     vpclmulhdq(xtmp, xK, xcrc);
7263     vpclmulldq(xcrc, xK, xcrc);
7264     pxor(xcrc, xbuf);
7265     pxor(xcrc, xtmp);
7266   } else {
7267     movdqa(xtmp, xcrc);
7268     pclmulhdq(xtmp, xK);
7269     pclmulldq(xcrc, xK);
7270     pxor(xcrc, xbuf);
7271     pxor(xcrc, xtmp);
7272   }
7273 }
7274 
7275 /**
7276  * 8-bit folds to compute 32-bit CRC
7277  *
7278  * uint64_t xcrc;
7279  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7280  */
7281 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7282   movdl(tmp, xcrc);
7283   andl(tmp, 0xFF);
7284   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7285   psrldq(xcrc, 1); // unsigned shift one byte
7286   pxor(xcrc, xtmp);
7287 }
7288 
7289 /**
7290  * uint32_t crc;
7291  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7292  */
7293 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7294   movl(tmp, crc);
7295   andl(tmp, 0xFF);
7296   shrl(crc, 8);
7297   xorl(crc, Address(table, tmp, Address::times_4, 0));
7298 }
7299 
7300 /**
7301  * @param crc   register containing existing CRC (32-bit)
7302  * @param buf   register pointing to input byte buffer (byte*)
7303  * @param len   register containing number of bytes
7304  * @param table register that will contain address of CRC table
7305  * @param tmp   scratch register
7306  */
7307 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7308   assert_different_registers(crc, buf, len, table, tmp, rax);
7309 
7310   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7311   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7312 
7313   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7314   // context for the registers used, where all instructions below are using 128-bit mode
7315   // On EVEX without VL and BW, these instructions will all be AVX.
7316   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7317   notl(crc); // ~crc
7318   cmpl(len, 16);
7319   jcc(Assembler::less, L_tail);
7320 
7321   // Align buffer to 16 bytes
7322   movl(tmp, buf);
7323   andl(tmp, 0xF);
7324   jccb(Assembler::zero, L_aligned);
7325   subl(tmp,  16);
7326   addl(len, tmp);
7327 
7328   align(4);
7329   BIND(L_align_loop);
7330   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7331   update_byte_crc32(crc, rax, table);
7332   increment(buf);
7333   incrementl(tmp);
7334   jccb(Assembler::less, L_align_loop);
7335 
7336   BIND(L_aligned);
7337   movl(tmp, len); // save
7338   shrl(len, 4);
7339   jcc(Assembler::zero, L_tail_restore);
7340 
7341   // Fold crc into first bytes of vector
7342   movdqa(xmm1, Address(buf, 0));
7343   movdl(rax, xmm1);
7344   xorl(crc, rax);
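  // Insert the updated CRC back into the low dword of xmm1; without SSE4.1 the 32-bit
  // pinsrd insert is emulated with two 16-bit pinsrw inserts.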
7345   if (VM_Version::supports_sse4_1()) {
7346     pinsrd(xmm1, crc, 0);
7347   } else {
7348     pinsrw(xmm1, crc, 0);
7349     shrl(crc, 16);
7350     pinsrw(xmm1, crc, 1);
7351   }
7352   addptr(buf, 16);
7353   subl(len, 4); // len > 0
7354   jcc(Assembler::less, L_fold_tail);
7355 
7356   movdqa(xmm2, Address(buf,  0));
7357   movdqa(xmm3, Address(buf, 16));
7358   movdqa(xmm4, Address(buf, 32));
7359   addptr(buf, 48);
7360   subl(len, 3);
7361   jcc(Assembler::lessEqual, L_fold_512b);
7362 
7363   // Fold total 512 bits of polynomial on each iteration,
7364   // 128 bits per each of 4 parallel streams.
7365   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
7366 
7367   align32();
7368   BIND(L_fold_512b_loop);
7369   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7370   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
7371   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
7372   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
7373   addptr(buf, 64);
7374   subl(len, 4);
7375   jcc(Assembler::greater, L_fold_512b_loop);
7376 
7377   // Fold 512 bits to 128 bits.
7378   BIND(L_fold_512b);
7379   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7380   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
7381   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7382   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7383 
7384   // Fold the rest of 128 bits data chunks
7385   BIND(L_fold_tail);
7386   addl(len, 3);
7387   jccb(Assembler::lessEqual, L_fold_128b);
7388   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7389 
7390   BIND(L_fold_tail_loop);
7391   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7392   addptr(buf, 16);
7393   decrementl(len);
7394   jccb(Assembler::greater, L_fold_tail_loop);
7395 
7396   // Fold 128 bits in xmm1 down into 32 bits in crc register.
7397   BIND(L_fold_128b);
7398   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7399   if (UseAVX > 0) {
7400     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7401     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7402     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7403   } else {
7404     movdqa(xmm2, xmm0);
7405     pclmulqdq(xmm2, xmm1, 0x1);
7406     movdqa(xmm3, xmm0);
7407     pand(xmm3, xmm2);
7408     pclmulqdq(xmm0, xmm3, 0x1);
7409   }
7410   psrldq(xmm1, 8);
7411   psrldq(xmm2, 4);
7412   pxor(xmm0, xmm1);
7413   pxor(xmm0, xmm2);
7414 
7415   // 8 8-bit folds to compute 32-bit CRC.
7416   for (int j = 0; j < 4; j++) {
7417     fold_8bit_crc32(xmm0, table, xmm1, rax);
7418   }
7419   movdl(crc, xmm0); // mov 32 bits to general register
7420   for (int j = 0; j < 4; j++) {
7421     fold_8bit_crc32(crc, table, rax);
7422   }
7423 
7424   BIND(L_tail_restore);
7425   movl(len, tmp); // restore
7426   BIND(L_tail);
7427   andl(len, 0xf);
7428   jccb(Assembler::zero, L_exit);
7429 
7430   // Fold the rest of bytes
7431   align(4);
7432   BIND(L_tail_loop);
7433   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7434   update_byte_crc32(crc, rax, table);
7435   increment(buf);
7436   decrementl(len);
7437   jccb(Assembler::greater, L_tail_loop);
7438 
7439   BIND(L_exit);
7440   notl(crc); // ~crc
7441 }
7442 
7443 #ifdef _LP64
7444 // Helper function for AVX 512 CRC32
7445 // Fold 512-bit data chunks
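// As in fold_128bit_crc32 above, but on four 128-bit lanes at once: each lane of the
// running remainder is carry-less multiplied by the corresponding constants in xK and
// xored with the next 64 bytes of input.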
7446 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
7447                                              Register pos, int offset) {
7448   evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
7449   evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
7450   evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
7451   evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
7452   evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
7453 }
7454 
7455 // Helper function for AVX 512 CRC32
7456 // Compute CRC32 for < 256B buffers
7457 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
7458                                               Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7459                                               Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7460 
7461   Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
7462   Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
7463   Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
7464 
7465   // check if there is enough buffer to be able to fold 16B at a time
7466   cmpl(len, 32);
7467   jcc(Assembler::less, L_less_than_32);
7468 
7469   // if there is, load the constants
7470   movdqu(xmm10, Address(table, 1 * 16));    //rk1 and rk2 in xmm10
7471   movdl(xmm0, crc);                        // get the initial crc value
7472   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7473   pxor(xmm7, xmm0);
7474 
7475   // update the buffer pointer
7476   addl(pos, 16);
7477   // update the counter; subtract 32 instead of 16 to save one instruction in the loop
7478   subl(len, 32);
7479   jmp(L_16B_reduction_loop);
7480 
7481   bind(L_less_than_32);
7482   // mov initial crc to the return value; this is necessary for zero-length buffers.
7483   movl(rax, crc);
7484   testl(len, len);
7485   jcc(Assembler::equal, L_cleanup);
7486 
7487   movdl(xmm0, crc);                        //get the initial crc value
7488 
7489   cmpl(len, 16);
7490   jcc(Assembler::equal, L_exact_16_left);
7491   jcc(Assembler::less, L_less_than_16_left);
7492 
7493   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7494   pxor(xmm7, xmm0);                       //xor the initial crc value
7495   addl(pos, 16);
7496   subl(len, 16);
7497   movdqu(xmm10, Address(table, 1 * 16));    // rk1 and rk2 in xmm10
7498   jmp(L_get_last_two_xmms);
7499 
7500   bind(L_less_than_16_left);
7501   // use stack space to load data of less than 16 bytes; zero out the 16B in memory first.
7502   pxor(xmm1, xmm1);
7503   movptr(tmp1, rsp);
7504   movdqu(Address(tmp1, 0 * 16), xmm1);
7505 
7506   cmpl(len, 4);
7507   jcc(Assembler::less, L_only_less_than_4);
7508 
7509   //backup the counter value
7510   movl(tmp2, len);
7511   cmpl(len, 8);
7512   jcc(Assembler::less, L_less_than_8_left);
7513 
7514   //load 8 Bytes
7515   movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
7516   movq(Address(tmp1, 0 * 16), rax);
7517   addptr(tmp1, 8);
7518   subl(len, 8);
7519   addl(pos, 8);
7520 
7521   bind(L_less_than_8_left);
7522   cmpl(len, 4);
7523   jcc(Assembler::less, L_less_than_4_left);
7524 
7525   //load 4 Bytes
7526   movl(rax, Address(buf, pos, Address::times_1, 0));
7527   movl(Address(tmp1, 0 * 16), rax);
7528   addptr(tmp1, 4);
7529   subl(len, 4);
7530   addl(pos, 4);
7531 
7532   bind(L_less_than_4_left);
7533   cmpl(len, 2);
7534   jcc(Assembler::less, L_less_than_2_left);
7535 
7536   // load 2 Bytes
7537   movw(rax, Address(buf, pos, Address::times_1, 0));
7538   movl(Address(tmp1, 0 * 16), rax);
7539   addptr(tmp1, 2);
7540   subl(len, 2);
7541   addl(pos, 2);
7542 
7543   bind(L_less_than_2_left);
7544   cmpl(len, 1);
7545   jcc(Assembler::less, L_zero_left);
7546 
7547   // load 1 Byte
7548   movb(rax, Address(buf, pos, Address::times_1, 0));
7549   movb(Address(tmp1, 0 * 16), rax);
7550 
7551   bind(L_zero_left);
7552   movdqu(xmm7, Address(rsp, 0));
7553   pxor(xmm7, xmm0);                       //xor the initial crc value
7554 
7555   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7556   movdqu(xmm0, Address(rax, tmp2));
7557   pshufb(xmm7, xmm0);
7558   jmp(L_128_done);
7559 
7560   bind(L_exact_16_left);
7561   movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
7562   pxor(xmm7, xmm0);                       //xor the initial crc value
7563   jmp(L_128_done);
7564 
7565   bind(L_only_less_than_4);
7566   cmpl(len, 3);
7567   jcc(Assembler::less, L_only_less_than_3);
7568 
7569   // load 3 Bytes
7570   movb(rax, Address(buf, pos, Address::times_1, 0));
7571   movb(Address(tmp1, 0), rax);
7572 
7573   movb(rax, Address(buf, pos, Address::times_1, 1));
7574   movb(Address(tmp1, 1), rax);
7575 
7576   movb(rax, Address(buf, pos, Address::times_1, 2));
7577   movb(Address(tmp1, 2), rax);
7578 
7579   movdqu(xmm7, Address(rsp, 0));
7580   pxor(xmm7, xmm0);                     //xor the initial crc value
7581 
7582   pslldq(xmm7, 0x5);
7583   jmp(L_barrett);
7584   bind(L_only_less_than_3);
7585   cmpl(len, 2);
7586   jcc(Assembler::less, L_only_less_than_2);
7587 
7588   // load 2 Bytes
7589   movb(rax, Address(buf, pos, Address::times_1, 0));
7590   movb(Address(tmp1, 0), rax);
7591 
7592   movb(rax, Address(buf, pos, Address::times_1, 1));
7593   movb(Address(tmp1, 1), rax);
7594 
7595   movdqu(xmm7, Address(rsp, 0));
7596   pxor(xmm7, xmm0);                     //xor the initial crc value
7597 
7598   pslldq(xmm7, 0x6);
7599   jmp(L_barrett);
7600 
7601   bind(L_only_less_than_2);
7602   //load 1 Byte
7603   movb(rax, Address(buf, pos, Address::times_1, 0));
7604   movb(Address(tmp1, 0), rax);
7605 
7606   movdqu(xmm7, Address(rsp, 0));
7607   pxor(xmm7, xmm0);                     //xor the initial crc value
7608 
7609   pslldq(xmm7, 0x7);
7610 }
7611 
7612 /**
7613 * Compute CRC32 using AVX512 instructions
7614 * @param crc   register containing existing CRC (32-bit)
7615 * @param buf   register pointing to input byte buffer (byte*)
7616 * @param len   register containing number of bytes
7617 * @param table address of crc or crc32c table
7618 * @param tmp1  scratch register
7619 * @param tmp2  scratch register
7620 * @return rax  result register
7621 *
7622 * This routine is identical for crc32c with the exception of the precomputed constant
7623 * table which will be passed as the table argument.  The calculation steps are
7624 * the same for both variants.
7625 */
7626 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
7627   assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
7628 
7629   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7630   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7631   Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
7632   Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
7633   Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
7634 
7635   const Register pos = r12;
7636   push(r12);
7637   subptr(rsp, 16 * 2 + 8);
7638 
7639   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7640   // context for the registers used, where all instructions below are using 128-bit mode
7641   // On EVEX without VL and BW, these instructions will all be AVX.
7642   movl(pos, 0);
7643 
7644   // check if smaller than 256B
7645   cmpl(len, 256);
7646   jcc(Assembler::less, L_less_than_256);
7647 
7648   // load the initial crc value
7649   movdl(xmm10, crc);
7650 
7651   // receive the initial 64B data, xor the initial crc value
7652   evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
7653   evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
7654   evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
7655   evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
7656 
7657   subl(len, 256);
7658   cmpl(len, 256);
7659   jcc(Assembler::less, L_fold_128_B_loop);
7660 
7661   evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
7662   evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
7663   evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
7664   subl(len, 256);
7665 
7666   bind(L_fold_256_B_loop);
7667   addl(pos, 256);
7668   fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
7669   fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
7670   fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
7671   fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
7672 
7673   subl(len, 256);
7674   jcc(Assembler::greaterEqual, L_fold_256_B_loop);
7675 
7676   // Fold 256 into 128
7677   addl(pos, 256);
7678   evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
7679   evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
7680   vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
7681 
7682   evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
7683   evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
7684   vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
7685 
7686   evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
7687   evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
7688 
7689   addl(len, 128);
7690   jmp(L_fold_128_B_register);
7691 
7692   // At this point in the code there are 128 * x + y (0 <= y < 128) bytes of buffer. The fold_128_B_loop
7693   // will fold 128B at a time until 128 + y bytes of buffer remain.
7694 
7695   // fold 128B at a time. This section of the code folds two zmm registers (8 xmm registers' worth of data) in parallel
7696   bind(L_fold_128_B_loop);
7697   addl(pos, 128);
7698   fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
7699   fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
7700 
7701   subl(len, 128);
7702   jcc(Assembler::greaterEqual, L_fold_128_B_loop);
7703 
7704   addl(pos, 128);
7705 
7706   // at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
7707   // and the 128B of folded data is held in zmm0 and zmm4
7708   bind(L_fold_128_B_register);
7709   evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
7710   evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7711   evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
7712   evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
7713   // save the last 128-bit lane, which has no multiplicand
7714   vextracti64x2(xmm7, xmm4, 3);
7715 
7716   evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
7717   evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
7718   // Needed later in reduction loop
7719   movdqu(xmm10, Address(table, 1 * 16));
7720   vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
7721   vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
7722 
7723   // Swap 1,0,3,2 - 01 00 11 10
7724   evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
7725   evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
7726   vextracti128(xmm5, xmm8, 1);
7727   evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
7728 
7729   // instead of 128 we add 128 - 16 to the loop counter, saving one instruction per iteration:
7730   // rather than a cmp, the sign flag set by the add/sub drives the jl (jcc less) branch
7731   addl(len, 128 - 16);
7732   jcc(Assembler::less, L_final_reduction_for_128);
7733 
7734   bind(L_16B_reduction_loop);
7735   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7736   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7737   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7738   movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
7739   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7740   addl(pos, 16);
7741   subl(len, 16);
7742   jcc(Assembler::greaterEqual, L_16B_reduction_loop);
7743 
7744   bind(L_final_reduction_for_128);
7745   addl(len, 16);
7746   jcc(Assembler::equal, L_128_done);
7747 
7748   bind(L_get_last_two_xmms);
7749   movdqu(xmm2, xmm7);
7750   addl(pos, len);
7751   movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
7752   subl(pos, len);
7753 
7754   // get rid of the extra data that was loaded before
7755   // load the shift constant
7756   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7757   movdqu(xmm0, Address(rax, len));
7758   addl(rax, len);
7759 
7760   vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7761   //Change mask to 512
7762   vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
7763   vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
7764 
7765   blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
7766   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7767   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7768   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7769   vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
7770 
7771   bind(L_128_done);
7772   // compute crc of a 128-bit value
7773   movdqu(xmm10, Address(table, 3 * 16));
7774   movdqu(xmm0, xmm7);
7775 
7776   // 64b fold
7777   vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
7778   vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
7779   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7780 
7781   // 32b fold
7782   movdqu(xmm0, xmm7);
7783   vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
7784   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7785   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7786   jmp(L_barrett);
7787 
7788   bind(L_less_than_256);
7789   kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
7790 
7791   //barrett reduction
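  // Barrett reduction (sketch): a carry-less multiply by a precomputed constant
  // estimates the quotient of the folded value by the CRC polynomial; multiplying
  // that quotient back by the polynomial and xoring leaves the 32-bit remainder,
  // which is the final CRC.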
7792   bind(L_barrett);
7793   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
7794   movdqu(xmm1, xmm7);
7795   movdqu(xmm2, xmm7);
7796   movdqu(xmm10, Address(table, 4 * 16));
7797 
7798   pclmulqdq(xmm7, xmm10, 0x0);
7799   pxor(xmm7, xmm2);
7800   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
7801   movdqu(xmm2, xmm7);
7802   pclmulqdq(xmm7, xmm10, 0x10);
7803   pxor(xmm7, xmm2);
7804   pxor(xmm7, xmm1);
7805   pextrd(crc, xmm7, 2);
7806 
7807   bind(L_cleanup);
7808   addptr(rsp, 16 * 2 + 8);
7809   pop(r12);
7810 }
7811 
7812 // S. Gueron / Information Processing Letters 112 (2012) 184
7813 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
7814 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
7815 // Output: the 64-bit carry-less product of B * CONST
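// Illustrative scalar sketch of the lookup scheme below (TABLEExt is a hypothetical
// uint64_t[ ][256] view of the precomputed table at StubRoutines::crc32c_table_addr(),
// holding the carry-less products byte * CONST):
//   uint64_t clmul_by_const(uint32_t B, const uint64_t* tbl /* = TABLEExt[n] */) {
//     uint64_t q1 = tbl[ B        & 0xFF];
//     uint64_t q2 = tbl[(B >>  8) & 0xFF];
//     uint64_t q3 = tbl[(B >> 16) & 0xFF];
//     uint64_t q4 = tbl[(B >> 24) & 0xFF];
//     return q1 ^ (q2 << 8) ^ (q3 << 16) ^ (q4 << 24);
//   }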
7816 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
7817                                      Register tmp1, Register tmp2, Register tmp3) {
7818   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7819   if (n > 0) {
7820     addq(tmp3, n * 256 * 8);
7821   }
7822   //    Q1 = TABLEExt[n][B & 0xFF];
7823   movl(tmp1, in);
7824   andl(tmp1, 0x000000FF);
7825   shll(tmp1, 3);
7826   addq(tmp1, tmp3);
7827   movq(tmp1, Address(tmp1, 0));
7828 
7829   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7830   movl(tmp2, in);
7831   shrl(tmp2, 8);
7832   andl(tmp2, 0x000000FF);
7833   shll(tmp2, 3);
7834   addq(tmp2, tmp3);
7835   movq(tmp2, Address(tmp2, 0));
7836 
7837   shlq(tmp2, 8);
7838   xorq(tmp1, tmp2);
7839 
7840   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7841   movl(tmp2, in);
7842   shrl(tmp2, 16);
7843   andl(tmp2, 0x000000FF);
7844   shll(tmp2, 3);
7845   addq(tmp2, tmp3);
7846   movq(tmp2, Address(tmp2, 0));
7847 
7848   shlq(tmp2, 16);
7849   xorq(tmp1, tmp2);
7850 
7851   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7852   shrl(in, 24);
7853   andl(in, 0x000000FF);
7854   shll(in, 3);
7855   addq(in, tmp3);
7856   movq(in, Address(in, 0));
7857 
7858   shlq(in, 24);
7859   xorq(in, tmp1);
7860   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7861 }
7862 
7863 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7864                                       Register in_out,
7865                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7866                                       XMMRegister w_xtmp2,
7867                                       Register tmp1,
7868                                       Register n_tmp2, Register n_tmp3) {
7869   if (is_pclmulqdq_supported) {
7870     movdl(w_xtmp1, in_out); // modified blindly
7871 
7872     movl(tmp1, const_or_pre_comp_const_index);
7873     movdl(w_xtmp2, tmp1);
7874     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7875 
7876     movdq(in_out, w_xtmp1);
7877   } else {
7878     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
7879   }
7880 }
7881 
7882 // Recombination Alternative 2: No bit-reflections
7883 // T1 = (CRC_A * U1) << 1
7884 // T2 = (CRC_B * U2) << 1
7885 // C1 = T1 >> 32
7886 // C2 = T2 >> 32
7887 // T1 = T1 & 0xFFFFFFFF
7888 // T2 = T2 & 0xFFFFFFFF
7889 // T1 = CRC32(0, T1)
7890 // T2 = CRC32(0, T2)
7891 // C1 = C1 ^ T1
7892 // C2 = C2 ^ T2
7893 // CRC = C1 ^ C2 ^ CRC_C
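// Illustrative scalar sketch of the recombination (crc32_u32 stands in for the
// hardware CRC32 instruction on a 32-bit operand; t1/t2 are the carry-less
// products CRC_A * U1 and CRC_B * U2 computed above):
//   uint32_t recombine(uint64_t t1, uint64_t t2, uint32_t crc_c) {
//     t1 <<= 1;  t2 <<= 1;
//     uint32_t c1 = (uint32_t)(t1 >> 32) ^ crc32_u32(0, (uint32_t)t1);
//     uint32_t c2 = (uint32_t)(t2 >> 32) ^ crc32_u32(0, (uint32_t)t2);
//     return c1 ^ c2 ^ crc_c;
//   }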
7894 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7895                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7896                                      Register tmp1, Register tmp2,
7897                                      Register n_tmp3) {
7898   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7899   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7900   shlq(in_out, 1);
7901   movl(tmp1, in_out);
7902   shrq(in_out, 32);
7903   xorl(tmp2, tmp2);
7904   crc32(tmp2, tmp1, 4);
7905   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
7906   shlq(in1, 1);
7907   movl(tmp1, in1);
7908   shrq(in1, 32);
7909   xorl(tmp2, tmp2);
7910   crc32(tmp2, tmp1, 4);
7911   xorl(in1, tmp2);
7912   xorl(in_out, in1);
7913   xorl(in_out, in2);
7914 }
7915 
7916 // Set N to a predefined value.
7917 // Subtract it from the length of the buffer.
7918 // execute in a loop:
7919 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
7920 // for i = 1 to N do
7921 //  CRC_A = CRC32(CRC_A, A[i])
7922 //  CRC_B = CRC32(CRC_B, B[i])
7923 //  CRC_C = CRC32(CRC_C, C[i])
7924 // end for
7925 // Recombine
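// Illustrative scalar sketch of one pass below (crc32_u64/load64 are hypothetical
// helpers for the hardware CRC32 quadword step and an unaligned 64-bit load):
//   while (len >= 3 * size) {
//     uint32_t crc_b = 0, crc_c = 0;
//     for (size_t i = 0; i < size; i += 8) {
//       crc   = crc32_u64(crc,   load64(buf + i));             // chunk A
//       crc_b = crc32_u64(crc_b, load64(buf + size + i));      // chunk B
//       crc_c = crc32_u64(crc_c, load64(buf + 2 * size + i));  // chunk C
//     }
//     crc = recombine(clmul(crc, U1), clmul(crc_b, U2), crc_c);  // crc32c_rec_alt2
//     buf += 3 * size;  len -= 3 * size;
//   }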
7926 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7927                                        Register in_out1, Register in_out2, Register in_out3,
7928                                        Register tmp1, Register tmp2, Register tmp3,
7929                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7930                                        Register tmp4, Register tmp5,
7931                                        Register n_tmp6) {
7932   Label L_processPartitions;
7933   Label L_processPartition;
7934   Label L_exit;
7935 
7936   bind(L_processPartitions);
7937   cmpl(in_out1, 3 * size);
7938   jcc(Assembler::less, L_exit);
7939     xorl(tmp1, tmp1);
7940     xorl(tmp2, tmp2);
7941     movq(tmp3, in_out2);
7942     addq(tmp3, size);
7943 
7944     bind(L_processPartition);
7945       crc32(in_out3, Address(in_out2, 0), 8);
7946       crc32(tmp1, Address(in_out2, size), 8);
7947       crc32(tmp2, Address(in_out2, size * 2), 8);
7948       addq(in_out2, 8);
7949       cmpq(in_out2, tmp3);
7950       jcc(Assembler::less, L_processPartition);
7951     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7952             w_xtmp1, w_xtmp2, w_xtmp3,
7953             tmp4, tmp5,
7954             n_tmp6);
7955     addq(in_out2, 2 * size);
7956     subl(in_out1, 3 * size);
7957     jmp(L_processPartitions);
7958 
7959   bind(L_exit);
7960 }
7961 #else
7962 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
7963                                      Register tmp1, Register tmp2, Register tmp3,
7964                                      XMMRegister xtmp1, XMMRegister xtmp2) {
7965   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7966   if (n > 0) {
7967     addl(tmp3, n * 256 * 8);
7968   }
7969   //    Q1 = TABLEExt[n][B & 0xFF];
7970   movl(tmp1, in_out);
7971   andl(tmp1, 0x000000FF);
7972   shll(tmp1, 3);
7973   addl(tmp1, tmp3);
7974   movq(xtmp1, Address(tmp1, 0));
7975 
7976   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7977   movl(tmp2, in_out);
7978   shrl(tmp2, 8);
7979   andl(tmp2, 0x000000FF);
7980   shll(tmp2, 3);
7981   addl(tmp2, tmp3);
7982   movq(xtmp2, Address(tmp2, 0));
7983 
7984   psllq(xtmp2, 8);
7985   pxor(xtmp1, xtmp2);
7986 
7987   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7988   movl(tmp2, in_out);
7989   shrl(tmp2, 16);
7990   andl(tmp2, 0x000000FF);
7991   shll(tmp2, 3);
7992   addl(tmp2, tmp3);
7993   movq(xtmp2, Address(tmp2, 0));
7994 
7995   psllq(xtmp2, 16);
7996   pxor(xtmp1, xtmp2);
7997 
7998   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7999   shrl(in_out, 24);
8000   andl(in_out, 0x000000FF);
8001   shll(in_out, 3);
8002   addl(in_out, tmp3);
8003   movq(xtmp2, Address(in_out, 0));
8004 
8005   psllq(xtmp2, 24);
8006   pxor(xtmp1, xtmp2); // Result in CXMM
8007   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
8008 }
8009 
8010 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
8011                                       Register in_out,
8012                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
8013                                       XMMRegister w_xtmp2,
8014                                       Register tmp1,
8015                                       Register n_tmp2, Register n_tmp3) {
8016   if (is_pclmulqdq_supported) {
8017     movdl(w_xtmp1, in_out);
8018 
8019     movl(tmp1, const_or_pre_comp_const_index);
8020     movdl(w_xtmp2, tmp1);
8021     pclmulqdq(w_xtmp1, w_xtmp2, 0);
8022     // Keep result in XMM since GPR is 32 bit in length
8023   } else {
8024     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
8025   }
8026 }
8027 
8028 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
8029                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8030                                      Register tmp1, Register tmp2,
8031                                      Register n_tmp3) {
8032   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8033   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8034 
8035   psllq(w_xtmp1, 1);
8036   movdl(tmp1, w_xtmp1);
8037   psrlq(w_xtmp1, 32);
8038   movdl(in_out, w_xtmp1);
8039 
8040   xorl(tmp2, tmp2);
8041   crc32(tmp2, tmp1, 4);
8042   xorl(in_out, tmp2);
8043 
8044   psllq(w_xtmp2, 1);
8045   movdl(tmp1, w_xtmp2);
8046   psrlq(w_xtmp2, 32);
8047   movdl(in1, w_xtmp2);
8048 
8049   xorl(tmp2, tmp2);
8050   crc32(tmp2, tmp1, 4);
8051   xorl(in1, tmp2);
8052   xorl(in_out, in1);
8053   xorl(in_out, in2);
8054 }
8055 
8056 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
8057                                        Register in_out1, Register in_out2, Register in_out3,
8058                                        Register tmp1, Register tmp2, Register tmp3,
8059                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8060                                        Register tmp4, Register tmp5,
8061                                        Register n_tmp6) {
8062   Label L_processPartitions;
8063   Label L_processPartition;
8064   Label L_exit;
8065 
8066   bind(L_processPartitions);
8067   cmpl(in_out1, 3 * size);
8068   jcc(Assembler::less, L_exit);
8069     xorl(tmp1, tmp1);
8070     xorl(tmp2, tmp2);
8071     movl(tmp3, in_out2);
8072     addl(tmp3, size);
8073 
8074     bind(L_processPartition);
8075       crc32(in_out3, Address(in_out2, 0), 4);
8076       crc32(tmp1, Address(in_out2, size), 4);
8077       crc32(tmp2, Address(in_out2, size*2), 4);
8078       crc32(in_out3, Address(in_out2, 0+4), 4);
8079       crc32(tmp1, Address(in_out2, size+4), 4);
8080       crc32(tmp2, Address(in_out2, size*2+4), 4);
8081       addl(in_out2, 8);
8082       cmpl(in_out2, tmp3);
8083       jcc(Assembler::less, L_processPartition);
8084 
8085         push(tmp3);
8086         push(in_out1);
8087         push(in_out2);
8088         tmp4 = tmp3;
8089         tmp5 = in_out1;
8090         n_tmp6 = in_out2;
8091 
8092       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
8093             w_xtmp1, w_xtmp2, w_xtmp3,
8094             tmp4, tmp5,
8095             n_tmp6);
8096 
8097         pop(in_out2);
8098         pop(in_out1);
8099         pop(tmp3);
8100 
8101     addl(in_out2, 2 * size);
8102     subl(in_out1, 3 * size);
8103     jmp(L_processPartitions);
8104 
8105   bind(L_exit);
8106 }
8107 #endif //LP64
8108 
8109 #ifdef _LP64
8110 // Algorithm 2: Pipelined usage of the CRC32 instruction.
8111 // Input: A buffer I of L bytes.
8112 // Output: the CRC32C value of the buffer.
8113 // Notations:
8114 // Write L = 24N + r, with N = floor (L/24).
8115 // r = L mod 24 (0 <= r < 24).
8116 // Consider I as the concatenation of A|B|C|R, where A, B, C each consist of
8117 // N quadwords, and R consists of r bytes.
8118 // A[j] = I [8j+7:8j], j = 0, 1, ..., N-1
8119 // B[j] = I [8N + 8j+7:8N + 8j], j = 0, 1, ..., N-1
8120 // C[j] = I [16N + 8j+7:16N + 8j], j = 0, 1, ..., N-1
8121 // if r > 0, R[j] = I [24N + j], j = 0, 1, ..., r-1
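// Example (illustrative): L = 100 gives N = floor(100/24) = 4 and r = 4, so A, B
// and C each cover 4 quadwords (32 bytes) and R holds the trailing 4 bytes,
// which are finished word-by-word and byte-by-byte below.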
8122 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
8123                                           Register tmp1, Register tmp2, Register tmp3,
8124                                           Register tmp4, Register tmp5, Register tmp6,
8125                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8126                                           bool is_pclmulqdq_supported) {
8127   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
8128   Label L_wordByWord;
8129   Label L_byteByByteProlog;
8130   Label L_byteByByte;
8131   Label L_exit;
8132 
8133   if (is_pclmulqdq_supported ) {
8134     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
8135     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
8136 
8137     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
8138     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
8139 
8140     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8141     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8142     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
8143   } else {
8144     const_or_pre_comp_const_index[0] = 1;
8145     const_or_pre_comp_const_index[1] = 0;
8146 
8147     const_or_pre_comp_const_index[2] = 3;
8148     const_or_pre_comp_const_index[3] = 2;
8149 
8150     const_or_pre_comp_const_index[4] = 5;
8151     const_or_pre_comp_const_index[5] = 4;
8152    }
8153   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8154                     in2, in1, in_out,
8155                     tmp1, tmp2, tmp3,
8156                     w_xtmp1, w_xtmp2, w_xtmp3,
8157                     tmp4, tmp5,
8158                     tmp6);
8159   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8160                     in2, in1, in_out,
8161                     tmp1, tmp2, tmp3,
8162                     w_xtmp1, w_xtmp2, w_xtmp3,
8163                     tmp4, tmp5,
8164                     tmp6);
8165   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8166                     in2, in1, in_out,
8167                     tmp1, tmp2, tmp3,
8168                     w_xtmp1, w_xtmp2, w_xtmp3,
8169                     tmp4, tmp5,
8170                     tmp6);
8171   movl(tmp1, in2);
8172   andl(tmp1, 0x00000007);
8173   negl(tmp1);
8174   addl(tmp1, in2);
8175   addq(tmp1, in1);
8176 
8177   BIND(L_wordByWord);
8178   cmpq(in1, tmp1);
8179   jcc(Assembler::greaterEqual, L_byteByByteProlog);
8180     crc32(in_out, Address(in1, 0), 4);
8181     addq(in1, 4);
8182     jmp(L_wordByWord);
8183 
8184   BIND(L_byteByByteProlog);
8185   andl(in2, 0x00000007);
8186   movl(tmp2, 1);
8187 
8188   BIND(L_byteByByte);
8189   cmpl(tmp2, in2);
8190   jccb(Assembler::greater, L_exit);
8191     crc32(in_out, Address(in1, 0), 1);
8192     incq(in1);
8193     incl(tmp2);
8194     jmp(L_byteByByte);
8195 
8196   BIND(L_exit);
8197 }
8198 #else
8199 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
8200                                           Register tmp1, Register  tmp2, Register tmp3,
8201                                           Register tmp4, Register  tmp5, Register tmp6,
8202                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8203                                           bool is_pclmulqdq_supported) {
8204   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
8205   Label L_wordByWord;
8206   Label L_byteByByteProlog;
8207   Label L_byteByByte;
8208   Label L_exit;
8209 
8210   if (is_pclmulqdq_supported) {
8211     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
8212     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
8213 
8214     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
8215     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
8216 
8217     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8218     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8219   } else {
8220     const_or_pre_comp_const_index[0] = 1;
8221     const_or_pre_comp_const_index[1] = 0;
8222 
8223     const_or_pre_comp_const_index[2] = 3;
8224     const_or_pre_comp_const_index[3] = 2;
8225 
8226     const_or_pre_comp_const_index[4] = 5;
8227     const_or_pre_comp_const_index[5] = 4;
8228   }
8229   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8230                     in2, in1, in_out,
8231                     tmp1, tmp2, tmp3,
8232                     w_xtmp1, w_xtmp2, w_xtmp3,
8233                     tmp4, tmp5,
8234                     tmp6);
8235   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8236                     in2, in1, in_out,
8237                     tmp1, tmp2, tmp3,
8238                     w_xtmp1, w_xtmp2, w_xtmp3,
8239                     tmp4, tmp5,
8240                     tmp6);
8241   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8242                     in2, in1, in_out,
8243                     tmp1, tmp2, tmp3,
8244                     w_xtmp1, w_xtmp2, w_xtmp3,
8245                     tmp4, tmp5,
8246                     tmp6);
8247   movl(tmp1, in2);
8248   andl(tmp1, 0x00000007);
8249   negl(tmp1);
8250   addl(tmp1, in2);
8251   addl(tmp1, in1);
8252 
8253   BIND(L_wordByWord);
8254   cmpl(in1, tmp1);
8255   jcc(Assembler::greaterEqual, L_byteByByteProlog);
8256     crc32(in_out, Address(in1,0), 4);
8257     addl(in1, 4);
8258     jmp(L_wordByWord);
8259 
8260   BIND(L_byteByByteProlog);
8261   andl(in2, 0x00000007);
8262   movl(tmp2, 1);
8263 
8264   BIND(L_byteByByte);
8265   cmpl(tmp2, in2);
8266   jccb(Assembler::greater, L_exit);
8267     movb(tmp1, Address(in1, 0));
8268     crc32(in_out, tmp1, 1);
8269     incl(in1);
8270     incl(tmp2);
8271     jmp(L_byteByByte);
8272 
8273   BIND(L_exit);
8274 }
8275 #endif // LP64
8276 #undef BIND
8277 #undef BLOCK_COMMENT
8278 
8279 // Compress char[] array to byte[].
8280 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
8281 //   @IntrinsicCandidate
8282 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8283 //     for (int i = 0; i < len; i++) {
8284 //       int c = src[srcOff++];
8285 //       if (c >>> 8 != 0) {
8286 //         return 0;
8287 //       }
8288 //       dst[dstOff++] = (byte)c;
8289 //     }
8290 //     return len;
8291 //   }
8292 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
8293   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8294   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8295   Register tmp5, Register result, KRegister mask1, KRegister mask2) {
8296   Label copy_chars_loop, return_length, return_zero, done;
8297 
8298   // rsi: src
8299   // rdi: dst
8300   // rdx: len
8301   // rcx: tmp5
8302   // rax: result
8303 
8304   // rsi holds start addr of source char[] to be compressed
8305   // rdi holds start addr of destination byte[]
8306   // rdx holds length
8307 
8308   assert(len != result, "");
8309 
8310   // save length for return
8311   push(len);
8312 
8313   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
8314     VM_Version::supports_avx512vlbw() &&
8315     VM_Version::supports_bmi2()) {
8316 
8317     Label copy_32_loop, copy_loop_tail, below_threshold;
8318 
8319     // alignment
8320     Label post_alignment;
8321 
8322     // if the length of the string is less than 32, handle it the old-fashioned way
8323     testl(len, -32);
8324     jcc(Assembler::zero, below_threshold);
8325 
8326     // First check whether a character is compressible ( <= 0xFF).
8327     // Create mask to test for Unicode chars inside zmm vector
8328     movl(result, 0x00FF);
8329     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
8330 
8331     testl(len, -64);
8332     jcc(Assembler::zero, post_alignment);
8333 
8334     movl(tmp5, dst);
8335     andl(tmp5, (32 - 1));
8336     negl(tmp5);
8337     andl(tmp5, (32 - 1));
8338 
8339     // bail out when there is nothing to be done
8340     testl(tmp5, 0xFFFFFFFF);
8341     jcc(Assembler::zero, post_alignment);
8342 
8343     // ~(~0 << tmp5), where tmp5 is the # of remaining elements to process
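    // e.g. tmp5 == 5 gives result == 0x0000001F (five low mask bits set)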
8344     movl(result, 0xFFFFFFFF);
8345     shlxl(result, result, tmp5);
8346     notl(result);
8347     kmovdl(mask2, result);
8348 
8349     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8350     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8351     ktestd(mask1, mask2);
8352     jcc(Assembler::carryClear, return_zero);
8353 
8354     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8355 
8356     addptr(src, tmp5);
8357     addptr(src, tmp5);
8358     addptr(dst, tmp5);
8359     subl(len, tmp5);
8360 
8361     bind(post_alignment);
8362     // end of alignment
8363 
8364     movl(tmp5, len);
8365     andl(tmp5, (32 - 1));    // tail count (in chars)
8366     andl(len, ~(32 - 1));    // vector count (in chars)
8367     jcc(Assembler::zero, copy_loop_tail);
8368 
8369     lea(src, Address(src, len, Address::times_2));
8370     lea(dst, Address(dst, len, Address::times_1));
8371     negptr(len);
8372 
8373     bind(copy_32_loop);
8374     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
8375     evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
8376     kortestdl(mask1, mask1);
8377     jcc(Assembler::carryClear, return_zero);
8378 
8379     // All elements in the current chunk are valid candidates for
8380     // compression. Write the truncated byte elements to memory.
8381     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
8382     addptr(len, 32);
8383     jcc(Assembler::notZero, copy_32_loop);
8384 
8385     bind(copy_loop_tail);
8386     // bail out when there is nothing to be done
8387     testl(tmp5, 0xFFFFFFFF);
8388     jcc(Assembler::zero, return_length);
8389 
8390     movl(len, tmp5);
8391 
8392     // ~(~0 << len), where len is the # of remaining elements to process
8393     movl(result, 0xFFFFFFFF);
8394     shlxl(result, result, len);
8395     notl(result);
8396 
8397     kmovdl(mask2, result);
8398 
8399     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8400     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8401     ktestd(mask1, mask2);
8402     jcc(Assembler::carryClear, return_zero);
8403 
8404     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8405     jmp(return_length);
8406 
8407     bind(below_threshold);
8408   }
8409 
8410   if (UseSSE42Intrinsics) {
8411     Label copy_32_loop, copy_16, copy_tail;
8412 
8413     movl(result, len);
8414 
8415     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
8416 
8417     // vectored compression
8418     andl(len, 0xfffffff0);    // vector count (in chars)
8419     andl(result, 0x0000000f);    // tail count (in chars)
8420     testl(len, len);
8421     jcc(Assembler::zero, copy_16);
8422 
8423     // compress 16 chars per iter
8424     movdl(tmp1Reg, tmp5);
8425     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8426     pxor(tmp4Reg, tmp4Reg);
8427 
8428     lea(src, Address(src, len, Address::times_2));
8429     lea(dst, Address(dst, len, Address::times_1));
8430     negptr(len);
8431 
8432     bind(copy_32_loop);
8433     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
8434     por(tmp4Reg, tmp2Reg);
8435     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
8436     por(tmp4Reg, tmp3Reg);
8437     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
8438     jcc(Assembler::notZero, return_zero);
8439     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8440     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
8441     addptr(len, 16);
8442     jcc(Assembler::notZero, copy_32_loop);
8443 
8444     // compress next vector of 8 chars (if any)
8445     bind(copy_16);
8446     movl(len, result);
8447     andl(len, 0xfffffff8);    // vector count (in chars)
8448     andl(result, 0x00000007);    // tail count (in chars)
8449     testl(len, len);
8450     jccb(Assembler::zero, copy_tail);
8451 
8452     movdl(tmp1Reg, tmp5);
8453     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8454     pxor(tmp3Reg, tmp3Reg);
8455 
8456     movdqu(tmp2Reg, Address(src, 0));
8457     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
8458     jccb(Assembler::notZero, return_zero);
8459     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8460     movq(Address(dst, 0), tmp2Reg);
8461     addptr(src, 16);
8462     addptr(dst, 8);
8463 
8464     bind(copy_tail);
8465     movl(len, result);
8466   }
8467   // compress 1 char per iter
8468   testl(len, len);
8469   jccb(Assembler::zero, return_length);
8470   lea(src, Address(src, len, Address::times_2));
8471   lea(dst, Address(dst, len, Address::times_1));
8472   negptr(len);
8473 
8474   bind(copy_chars_loop);
8475   load_unsigned_short(result, Address(src, len, Address::times_2));
8476   testl(result, 0xff00);      // check if Unicode char
8477   jccb(Assembler::notZero, return_zero);
8478   movb(Address(dst, len, Address::times_1), result);  // ASCII char; compress to 1 byte
8479   increment(len);
8480   jcc(Assembler::notZero, copy_chars_loop);
8481 
8482   // if compression succeeded, return length
8483   bind(return_length);
8484   pop(result);
8485   jmpb(done);
8486 
8487   // if compression failed, return 0
8488   bind(return_zero);
8489   xorl(result, result);
8490   addptr(rsp, wordSize);
8491 
8492   bind(done);
8493 }
8494 
8495 // Inflate byte[] array to char[].
8496 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
8497 //   @IntrinsicCandidate
8498 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
8499 //     for (int i = 0; i < len; i++) {
8500 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
8501 //     }
8502 //   }
8503 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
8504   XMMRegister tmp1, Register tmp2, KRegister mask) {
8505   Label copy_chars_loop, done, below_threshold, avx3_threshold;
8506   // rsi: src
8507   // rdi: dst
8508   // rdx: len
8509   // rcx: tmp2
8510 
8511   // rsi holds start addr of source byte[] to be inflated
8512   // rdi holds start addr of destination char[]
8513   // rdx holds length
8514   assert_different_registers(src, dst, len, tmp2);
8515   movl(tmp2, len);
8516   if ((UseAVX > 2) && // AVX512
8517     VM_Version::supports_avx512vlbw() &&
8518     VM_Version::supports_bmi2()) {
8519 
8520     Label copy_32_loop, copy_tail;
8521     Register tmp3_aliased = len;
8522 
8523     // if the length of the string is less than 16, handle it the old-fashioned way
8524     testl(len, -16);
8525     jcc(Assembler::zero, below_threshold);
8526 
8527     testl(len, -1 * AVX3Threshold);
8528     jcc(Assembler::zero, avx3_threshold);
8529 
8530     // Pre-compute the tail and vector counts here so the main loop needs only
8531     // one arithmetic operation per iteration.
8532     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
8533     andl(len, -32);     // vector count
8534     jccb(Assembler::zero, copy_tail);
8535 
8536     lea(src, Address(src, len, Address::times_1));
8537     lea(dst, Address(dst, len, Address::times_2));
8538     negptr(len);
8539 
8540 
8541     // inflate 32 chars per iter
8542     bind(copy_32_loop);
8543     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
8544     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
8545     addptr(len, 32);
8546     jcc(Assembler::notZero, copy_32_loop);
8547 
8548     bind(copy_tail);
8549     // bail out when there is nothing to be done
8550     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
8551     jcc(Assembler::zero, done);
8552 
8553     // ~(~0 << tmp2), where tmp2 is the # of remaining elements to process
8554     movl(tmp3_aliased, -1);
8555     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
8556     notl(tmp3_aliased);
8557     kmovdl(mask, tmp3_aliased);
8558     evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
8559     evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
8560 
8561     jmp(done);
8562     bind(avx3_threshold);
8563   }
8564   if (UseSSE42Intrinsics) {
8565     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
8566 
8567     if (UseAVX > 1) {
8568       andl(tmp2, (16 - 1));
8569       andl(len, -16);
8570       jccb(Assembler::zero, copy_new_tail);
8571     } else {
8572       andl(tmp2, 0x00000007);   // tail count (in chars)
8573       andl(len, 0xfffffff8);    // vector count (in chars)
8574       jccb(Assembler::zero, copy_tail);
8575     }
8576 
8577     // vectored inflation
8578     lea(src, Address(src, len, Address::times_1));
8579     lea(dst, Address(dst, len, Address::times_2));
8580     negptr(len);
8581 
8582     if (UseAVX > 1) {
8583       bind(copy_16_loop);
8584       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
8585       vmovdqu(Address(dst, len, Address::times_2), tmp1);
8586       addptr(len, 16);
8587       jcc(Assembler::notZero, copy_16_loop);
8588 
8589       bind(below_threshold);
8590       bind(copy_new_tail);
8591       movl(len, tmp2);
8592       andl(tmp2, 0x00000007);
8593       andl(len, 0xFFFFFFF8);
8594       jccb(Assembler::zero, copy_tail);
8595 
8596       pmovzxbw(tmp1, Address(src, 0));
8597       movdqu(Address(dst, 0), tmp1);
8598       addptr(src, 8);
8599       addptr(dst, 2 * 8);
8600 
8601       jmp(copy_tail, true);
8602     }
8603 
8604     // inflate 8 chars per iter
8605     bind(copy_8_loop);
8606     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
8607     movdqu(Address(dst, len, Address::times_2), tmp1);
8608     addptr(len, 8);
8609     jcc(Assembler::notZero, copy_8_loop);
8610 
8611     bind(copy_tail);
8612     movl(len, tmp2);
8613 
8614     cmpl(len, 4);
8615     jccb(Assembler::less, copy_bytes);
8616 
8617     movdl(tmp1, Address(src, 0));  // load 4 byte chars
8618     pmovzxbw(tmp1, tmp1);
8619     movq(Address(dst, 0), tmp1);
8620     subptr(len, 4);
8621     addptr(src, 4);
8622     addptr(dst, 8);
8623 
8624     bind(copy_bytes);
8625   } else {
8626     bind(below_threshold);
8627   }
8628 
8629   testl(len, len);
8630   jccb(Assembler::zero, done);
8631   lea(src, Address(src, len, Address::times_1));
8632   lea(dst, Address(dst, len, Address::times_2));
8633   negptr(len);
8634 
8635   // inflate 1 char per iter
8636   bind(copy_chars_loop);
8637   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
8638   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
8639   increment(len);
8640   jcc(Assembler::notZero, copy_chars_loop);
8641 
8642   bind(done);
8643 }
8644 
8645 
8646 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
8647   switch(type) {
8648     case T_BYTE:
8649     case T_BOOLEAN:
8650       evmovdqub(dst, kmask, src, merge, vector_len);
8651       break;
8652     case T_CHAR:
8653     case T_SHORT:
8654       evmovdquw(dst, kmask, src, merge, vector_len);
8655       break;
8656     case T_INT:
8657     case T_FLOAT:
8658       evmovdqul(dst, kmask, src, merge, vector_len);
8659       break;
8660     case T_LONG:
8661     case T_DOUBLE:
8662       evmovdquq(dst, kmask, src, merge, vector_len);
8663       break;
8664     default:
8665       fatal("Unexpected type argument %s", type2name(type));
8666       break;
8667   }
8668 }
8669 
8670 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
8671   switch(type) {
8672     case T_BYTE:
8673     case T_BOOLEAN:
8674       evmovdqub(dst, kmask, src, merge, vector_len);
8675       break;
8676     case T_CHAR:
8677     case T_SHORT:
8678       evmovdquw(dst, kmask, src, merge, vector_len);
8679       break;
8680     case T_INT:
8681     case T_FLOAT:
8682       evmovdqul(dst, kmask, src, merge, vector_len);
8683       break;
8684     case T_LONG:
8685     case T_DOUBLE:
8686       evmovdquq(dst, kmask, src, merge, vector_len);
8687       break;
8688     default:
8689       fatal("Unexpected type argument %s", type2name(type));
8690       break;
8691   }
8692 }
8693 
8694 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
8695   switch(masklen) {
8696     case 2:
8697        knotbl(dst, src);
8698        movl(rtmp, 3);
8699        kmovbl(ktmp, rtmp);
8700        kandbl(dst, ktmp, dst);
8701        break;
8702     case 4:
8703        knotbl(dst, src);
8704        movl(rtmp, 15);
8705        kmovbl(ktmp, rtmp);
8706        kandbl(dst, ktmp, dst);
8707        break;
8708     case 8:
8709        knotbl(dst, src);
8710        break;
8711     case 16:
8712        knotwl(dst, src);
8713        break;
8714     case 32:
8715        knotdl(dst, src);
8716        break;
8717     case 64:
8718        knotql(dst, src);
8719        break;
8720     default:
8721       fatal("Unexpected vector length %d", masklen);
8722       break;
8723   }
8724 }
8725 
8726 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8727   switch(type) {
8728     case T_BOOLEAN:
8729     case T_BYTE:
8730        kandbl(dst, src1, src2);
8731        break;
8732     case T_CHAR:
8733     case T_SHORT:
8734        kandwl(dst, src1, src2);
8735        break;
8736     case T_INT:
8737     case T_FLOAT:
8738        kanddl(dst, src1, src2);
8739        break;
8740     case T_LONG:
8741     case T_DOUBLE:
8742        kandql(dst, src1, src2);
8743        break;
8744     default:
8745       fatal("Unexpected type argument %s", type2name(type));
8746       break;
8747   }
8748 }
8749 
8750 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8751   switch(type) {
8752     case T_BOOLEAN:
8753     case T_BYTE:
8754        korbl(dst, src1, src2);
8755        break;
8756     case T_CHAR:
8757     case T_SHORT:
8758        korwl(dst, src1, src2);
8759        break;
8760     case T_INT:
8761     case T_FLOAT:
8762        kordl(dst, src1, src2);
8763        break;
8764     case T_LONG:
8765     case T_DOUBLE:
8766        korql(dst, src1, src2);
8767        break;
8768     default:
8769       fatal("Unexpected type argument %s", type2name(type));
8770       break;
8771   }
8772 }
8773 
8774 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
8775   switch(type) {
8776     case T_BOOLEAN:
8777     case T_BYTE:
8778        kxorbl(dst, src1, src2);
8779        break;
8780     case T_CHAR:
8781     case T_SHORT:
8782        kxorwl(dst, src1, src2);
8783        break;
8784     case T_INT:
8785     case T_FLOAT:
8786        kxordl(dst, src1, src2);
8787        break;
8788     case T_LONG:
8789     case T_DOUBLE:
8790        kxorql(dst, src1, src2);
8791        break;
8792     default:
8793       fatal("Unexpected type argument %s", type2name(type));
8794       break;
8795   }
8796 }
8797 
8798 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8799   switch(type) {
8800     case T_BOOLEAN:
8801     case T_BYTE:
8802       evpermb(dst, mask, nds, src, merge, vector_len); break;
8803     case T_CHAR:
8804     case T_SHORT:
8805       evpermw(dst, mask, nds, src, merge, vector_len); break;
8806     case T_INT:
8807     case T_FLOAT:
8808       evpermd(dst, mask, nds, src, merge, vector_len); break;
8809     case T_LONG:
8810     case T_DOUBLE:
8811       evpermq(dst, mask, nds, src, merge, vector_len); break;
8812     default:
8813       fatal("Unexpected type argument %s", type2name(type)); break;
8814   }
8815 }
8816 
8817 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8818   switch(type) {
8819     case T_BOOLEAN:
8820     case T_BYTE:
8821       evpermb(dst, mask, nds, src, merge, vector_len); break;
8822     case T_CHAR:
8823     case T_SHORT:
8824       evpermw(dst, mask, nds, src, merge, vector_len); break;
8825     case T_INT:
8826     case T_FLOAT:
8827       evpermd(dst, mask, nds, src, merge, vector_len); break;
8828     case T_LONG:
8829     case T_DOUBLE:
8830       evpermq(dst, mask, nds, src, merge, vector_len); break;
8831     default:
8832       fatal("Unexpected type argument %s", type2name(type)); break;
8833   }
8834 }
8835 
8836 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8837   switch(type) {
8838     case T_BYTE:
8839       evpminsb(dst, mask, nds, src, merge, vector_len); break;
8840     case T_SHORT:
8841       evpminsw(dst, mask, nds, src, merge, vector_len); break;
8842     case T_INT:
8843       evpminsd(dst, mask, nds, src, merge, vector_len); break;
8844     case T_LONG:
8845       evpminsq(dst, mask, nds, src, merge, vector_len); break;
8846     default:
8847       fatal("Unexpected type argument %s", type2name(type)); break;
8848   }
8849 }
8850 
8851 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8852   switch(type) {
8853     case T_BYTE:
8854       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
8855     case T_SHORT:
8856       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
8857     case T_INT:
8858       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
8859     case T_LONG:
8860       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
8861     default:
8862       fatal("Unexpected type argument %s", type2name(type)); break;
8863   }
8864 }
8865 
8866 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8867   switch(type) {
8868     case T_BYTE:
8869       evpminsb(dst, mask, nds, src, merge, vector_len); break;
8870     case T_SHORT:
8871       evpminsw(dst, mask, nds, src, merge, vector_len); break;
8872     case T_INT:
8873       evpminsd(dst, mask, nds, src, merge, vector_len); break;
8874     case T_LONG:
8875       evpminsq(dst, mask, nds, src, merge, vector_len); break;
8876     default:
8877       fatal("Unexpected type argument %s", type2name(type)); break;
8878   }
8879 }
8880 
8881 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8882   switch(type) {
8883     case T_BYTE:
8884       evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
8885     case T_SHORT:
8886       evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
8887     case T_INT:
8888       evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
8889     case T_LONG:
8890       evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
8891     default:
8892       fatal("Unexpected type argument %s", type2name(type)); break;
8893   }
8894 }
8895 
8896 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8897   switch(type) {
8898     case T_INT:
8899       evpxord(dst, mask, nds, src, merge, vector_len); break;
8900     case T_LONG:
8901       evpxorq(dst, mask, nds, src, merge, vector_len); break;
8902     default:
8903       fatal("Unexpected type argument %s", type2name(type)); break;
8904   }
8905 }
8906 
8907 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8908   switch(type) {
8909     case T_INT:
8910       evpxord(dst, mask, nds, src, merge, vector_len); break;
8911     case T_LONG:
8912       evpxorq(dst, mask, nds, src, merge, vector_len); break;
8913     default:
8914       fatal("Unexpected type argument %s", type2name(type)); break;
8915   }
8916 }
8917 
8918 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8919   switch(type) {
8920     case T_INT:
8921       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
8922     case T_LONG:
8923       evporq(dst, mask, nds, src, merge, vector_len); break;
8924     default:
8925       fatal("Unexpected type argument %s", type2name(type)); break;
8926   }
8927 }
8928 
8929 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8930   switch(type) {
8931     case T_INT:
8932       Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
8933     case T_LONG:
8934       evporq(dst, mask, nds, src, merge, vector_len); break;
8935     default:
8936       fatal("Unexpected type argument %s", type2name(type)); break;
8937   }
8938 }
8939 
8940 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
8941   switch(type) {
8942     case T_INT:
8943       evpandd(dst, mask, nds, src, merge, vector_len); break;
8944     case T_LONG:
8945       evpandq(dst, mask, nds, src, merge, vector_len); break;
8946     default:
8947       fatal("Unexpected type argument %s", type2name(type)); break;
8948   }
8949 }
8950 
8951 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
8952   switch(type) {
8953     case T_INT:
8954       evpandd(dst, mask, nds, src, merge, vector_len); break;
8955     case T_LONG:
8956       evpandq(dst, mask, nds, src, merge, vector_len); break;
8957     default:
8958       fatal("Unexpected type argument %s", type2name(type)); break;
8959   }
8960 }
8961 
8962 void MacroAssembler::anytrue(Register dst, uint masklen, KRegister src1, KRegister src2) {
8963    masklen = masklen < 8 ? 8 : masklen;
8964    ktest(masklen, src1, src2);
8965    setb(Assembler::notZero, dst);
8966    movzbl(dst, dst);
8967 }
8968 
8969 void MacroAssembler::alltrue(Register dst, uint masklen, KRegister src1, KRegister src2, KRegister kscratch) {
8970   if (masklen < 8) {
8971     knotbl(kscratch, src2);
8972     kortestbl(src1, kscratch);
8973     setb(Assembler::carrySet, dst);
8974     movzbl(dst, dst);
8975   } else {
8976     ktest(masklen, src1, src2);
8977     setb(Assembler::carrySet, dst);
8978     movzbl(dst, dst);
8979   }
8980 }
8981 
8982 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
8983   switch(masklen) {
8984     case 8:
8985        kortestbl(src1, src2);
8986        break;
8987     case 16:
8988        kortestwl(src1, src2);
8989        break;
8990     case 32:
8991        kortestdl(src1, src2);
8992        break;
8993     case 64:
8994        kortestql(src1, src2);
8995        break;
8996     default:
8997       fatal("Unexpected mask length %d", masklen);
8998       break;
8999   }
9000 }
9001 
9002 
9003 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
9004   switch(masklen)  {
9005     case 8:
9006        ktestbl(src1, src2);
9007        break;
9008     case 16:
9009        ktestwl(src1, src2);
9010        break;
9011     case 32:
9012        ktestdl(src1, src2);
9013        break;
9014     case 64:
9015        ktestql(src1, src2);
9016        break;
9017     default:
9018       fatal("Unexpected mask length %d", masklen);
9019       break;
9020   }
9021 }
9022 
9023 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9024   switch(type) {
9025     case T_INT:
9026       evprold(dst, mask, src, shift, merge, vlen_enc); break;
9027     case T_LONG:
9028       evprolq(dst, mask, src, shift, merge, vlen_enc); break;
9029     default:
9030       fatal("Unexpected type argument %s", type2name(type)); break;
9032   }
9033 }
9034 
9035 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9036   switch(type) {
9037     case T_INT:
9038       evprord(dst, mask, src, shift, merge, vlen_enc); break;
9039     case T_LONG:
9040       evprorq(dst, mask, src, shift, merge, vlen_enc); break;
9041     default:
9042       fatal("Unexpected type argument %s", type2name(type)); break;
9043   }
9044 }
9045 
9046 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9047   switch(type) {
9048     case T_INT:
9049       evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
9050     case T_LONG:
9051       evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
9052     default:
9053       fatal("Unexpected type argument %s", type2name(type)); break;
9054   }
9055 }
9056 
9057 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9058   switch(type) {
9059     case T_INT:
9060       evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
9061     case T_LONG:
9062       evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
9063     default:
9064       fatal("Unexpected type argument %s", type2name(type)); break;
9065   }
9066 }
9067 #if COMPILER2_OR_JVMCI
9068 
9069 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
9070                                  Register length, Register temp, int vec_enc) {
9071   // Compute the mask for the predicated vector store.
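  // e.g. length == 5: bzhiq leaves only the five low bits of -1 set (0x1F), so
  // the masked store below writes exactly five elements.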
9072   movptr(temp, -1);
9073   bzhiq(temp, temp, length);
9074   kmov(mask, temp);
9075   evmovdqu(bt, mask, dst, xmm, true, vec_enc);
9076 }
9077 
9078 // Masked memory fill for lengths of at most 64 bytes.
9079 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
9080                                        XMMRegister xmm, KRegister mask, Register length,
9081                                        Register temp, bool use64byteVector) {
9082   assert(MaxVectorSize >= 32, "vector length should be >= 32");
9083   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9084   if (!use64byteVector) {
9085     fill32(dst, disp, xmm);
9086     subptr(length, 32 >> shift);
9087     fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
9088   } else {
9089     assert(MaxVectorSize == 64, "vector length != 64");
9090     fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
9091   }
9092 }
9093 
9094 
9095 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
9096                                        XMMRegister xmm, KRegister mask, Register length,
9097                                        Register temp) {
9098   assert(MaxVectorSize >= 32, "vector length should be >= 32");
9099   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9100   fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
9101 }
9102 
9103 
9104 void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
9105   assert(MaxVectorSize >= 32, "vector length should be >= 32");
9106   vmovdqu(dst, xmm);
9107 }
9108 
9109 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
9110   fill32(Address(dst, disp), xmm);
9111 }
9112 
9113 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
9114   assert(MaxVectorSize >= 32, "vector length should be >= 32");
9115   if (!use64byteVector) {
9116     fill32(dst, xmm);
9117     fill32(dst.plus_disp(32), xmm);
9118   } else {
9119     evmovdquq(dst, xmm, Assembler::AVX_512bit);
9120   }
9121 }
9122 
9123 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
9124   fill64(Address(dst, disp), xmm, use64byteVector);
9125 }
9126 
9127 #ifdef _LP64
9128 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
9129                                         Register count, Register rtmp, XMMRegister xtmp) {
9130   Label L_exit;
9131   Label L_fill_start;
9132   Label L_fill_64_bytes;
9133   Label L_fill_96_bytes;
9134   Label L_fill_128_bytes;
9135   Label L_fill_128_bytes_loop;
9136   Label L_fill_128_loop_header;
9137   Label L_fill_128_bytes_loop_header;
9138   Label L_fill_128_bytes_loop_pre_header;
9139   Label L_fill_zmm_sequence;
9140 
9141   int shift = -1;
9142   int avx3threshold = VM_Version::avx3_threshold();
9143   switch(type) {
9144     case T_BYTE:  shift = 0;
9145       break;
9146     case T_SHORT: shift = 1;
9147       break;
9148     case T_INT:   shift = 2;
9149       break;
9150     /* Uncomment when LONG fill stubs are supported.
9151     case T_LONG:  shift = 3;
9152       break;
9153     */
9154     default:
9155       fatal("Unhandled type: %s\n", type2name(type));
9156   }
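  // count is in elements, so the byte thresholds below are scaled by 'shift'
  // (e.g. for T_SHORT, 32 >> shift == 16 elements == 32 bytes).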
9157 
9158   if ((avx3threshold != 0)  || (MaxVectorSize == 32)) {
9159 
9160     if (MaxVectorSize == 64) {
9161       cmpq(count, avx3threshold >> shift);
9162       jcc(Assembler::greater, L_fill_zmm_sequence);
9163     }
9164 
9165     evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
9166 
9167     bind(L_fill_start);
9168 
9169     cmpq(count, 32 >> shift);
9170     jccb(Assembler::greater, L_fill_64_bytes);
9171     fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
9172     jmp(L_exit);
9173 
9174     bind(L_fill_64_bytes);
9175     cmpq(count, 64 >> shift);
9176     jccb(Assembler::greater, L_fill_96_bytes);
9177     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
9178     jmp(L_exit);
9179 
9180     bind(L_fill_96_bytes);
9181     cmpq(count, 96 >> shift);
9182     jccb(Assembler::greater, L_fill_128_bytes);
9183     fill64(to, 0, xtmp);
9184     subq(count, 64 >> shift);
9185     fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
9186     jmp(L_exit);
9187 
9188     bind(L_fill_128_bytes);
9189     cmpq(count, 128 >> shift);
9190     jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
9191     fill64(to, 0, xtmp);
9192     fill32(to, 64, xtmp);
9193     subq(count, 96 >> shift);
9194     fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
9195     jmp(L_exit);
9196 
9197     bind(L_fill_128_bytes_loop_pre_header);
9198     {
9199       mov(rtmp, to);
9200       andq(rtmp, 31);
9201       jccb(Assembler::zero, L_fill_128_bytes_loop_header);
9202       negq(rtmp);
9203       addq(rtmp, 32);
9204       mov64(r8, -1L);
9205       bzhiq(r8, r8, rtmp);
9206       kmovql(k2, r8);
9207       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit);
9208       addq(to, rtmp);
9209       shrq(rtmp, shift);
9210       subq(count, rtmp);
9211     }
9212 
9213     cmpq(count, 128 >> shift);
9214     jcc(Assembler::less, L_fill_start);
9215 
9216     bind(L_fill_128_bytes_loop_header);
9217     subq(count, 128 >> shift);
9218 
9219     align32();
9220     bind(L_fill_128_bytes_loop);
9221       fill64(to, 0, xtmp);
9222       fill64(to, 64, xtmp);
9223       addq(to, 128);
9224       subq(count, 128 >> shift);
9225       jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
9226 
9227     addq(count, 128 >> shift);
9228     jcc(Assembler::zero, L_exit);
9229     jmp(L_fill_start);
9230   }
9231 
9232   if (MaxVectorSize == 64) {
9233     // Sequence using 64 byte ZMM register.
9234     Label L_fill_128_bytes_zmm;
9235     Label L_fill_192_bytes_zmm;
9236     Label L_fill_192_bytes_loop_zmm;
9237     Label L_fill_192_bytes_loop_header_zmm;
9238     Label L_fill_192_bytes_loop_pre_header_zmm;
9239     Label L_fill_start_zmm_sequence;
9240 
9241     bind(L_fill_zmm_sequence);
9242     evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
9243 
9244     bind(L_fill_start_zmm_sequence);
9245     cmpq(count, 64 >> shift);
9246     jccb(Assembler::greater, L_fill_128_bytes_zmm);
9247     fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
9248     jmp(L_exit);
9249 
9250     bind(L_fill_128_bytes_zmm);
9251     cmpq(count, 128 >> shift);
9252     jccb(Assembler::greater, L_fill_192_bytes_zmm);
9253     fill64(to, 0, xtmp, true);
9254     subq(count, 64 >> shift);
9255     fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
9256     jmp(L_exit);
9257 
9258     bind(L_fill_192_bytes_zmm);
9259     cmpq(count, 192 >> shift);
9260     jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
9261     fill64(to, 0, xtmp, true);
9262     fill64(to, 64, xtmp, true);
9263     subq(count, 128 >> shift);
9264     fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
9265     jmp(L_exit);
9266 
9267     bind(L_fill_192_bytes_loop_pre_header_zmm);
9268     {
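      // Align 'to' up to a 64-byte boundary, mirroring the 32-byte pre-header
      // above: a masked 64-byte store covers the head bytes, then 'to' and
      // 'count' are adjusted by the corresponding element count.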
9269       movq(rtmp, to);
9270       andq(rtmp, 63);
9271       jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
9272       negq(rtmp);
9273       addq(rtmp, 64);
9274       mov64(r8, -1L);
9275       bzhiq(r8, r8, rtmp);
9276       kmovql(k2, r8);
9277       evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit);
9278       addq(to, rtmp);
9279       shrq(rtmp, shift);
9280       subq(count, rtmp);
9281     }
9282 
9283     cmpq(count, 192 >> shift);
9284     jcc(Assembler::less, L_fill_start_zmm_sequence);
9285 
9286     bind(L_fill_192_bytes_loop_header_zmm);
9287     subq(count, 192 >> shift);
9288 
9289     align32();
9290     bind(L_fill_192_bytes_loop_zmm);
9291       fill64(to, 0, xtmp, true);
9292       fill64(to, 64, xtmp, true);
9293       fill64(to, 128, xtmp, true);
9294       addq(to, 192);
9295       subq(count, 192 >> shift);
9296       jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
9297 
9298     addq(count, 192 >> shift);
9299     jcc(Assembler::zero, L_exit);
9300     jmp(L_fill_start_zmm_sequence);
9301   }
9302   bind(L_exit);
9303 }
9304 #endif // _LP64
9305 #endif //COMPILER2_OR_JVMCI
9306 
9307 
9308 #ifdef _LP64
9309 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
9310   Label done;
9311   cvttss2sil(dst, src);
9312   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9313   cmpl(dst, 0x80000000); // float_sign_flip
9314   jccb(Assembler::notEqual, done);
9315   subptr(rsp, 8);
9316   movflt(Address(rsp, 0), src);
9317   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
9318   pop(dst);
9319   bind(done);
9320 }
9321 
9322 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
9323   Label done;
9324   cvttsd2sil(dst, src);
9325   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
9326   cmpl(dst, 0x80000000); // float_sign_flip
9327   jccb(Assembler::notEqual, done);
9328   subptr(rsp, 8);
9329   movdbl(Address(rsp, 0), src);
9330   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
9331   pop(dst);
9332   bind(done);
9333 }
9334 
9335 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
9336   Label done;
9337   cvttss2siq(dst, src);
9338   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9339   jccb(Assembler::notEqual, done);
9340   subptr(rsp, 8);
9341   movflt(Address(rsp, 0), src);
9342   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
9343   pop(dst);
9344   bind(done);
9345 }
9346 
9347 void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
9348   // The following code is a line-by-line assembly translation of the rounding algorithm.
9349   // Please refer to the java.lang.Math.round(float) algorithm for details.
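  // For reference, a sketch of the computation the assembly below performs
  // (constant names mirror those defined above; see Math.round(float) for the
  // authoritative algorithm):
  //   int bits  = floatToRawIntBits(f);
  //   int bexp  = (bits & EXP_BIT_MASK) >> (SIGNIFICAND_WIDTH - 1);
  //   int shift = (SIGNIFICAND_WIDTH - 2 + EXP_BIAS) - bexp;
  //   if ((shift & MINUS_32) == 0) {   // shift fits in [0, 31]
  //     int r = (bits & SIGNIF_BIT_MASK) | (SIGNIF_BIT_MASK + 1);
  //     if (bits < 0) r = -r;
  //     return ((r >> shift) + 1) >> 1;
  //   }
  //   return (int) f;                  // special case handled by convert_f2i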
9350   const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
9351   const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
9352   const int32_t FloatConsts_EXP_BIAS = 127;
9353   const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
9354   const int32_t MINUS_32 = 0xFFFFFFE0;
9355   Label L_special_case, L_block1, L_exit;
9356   movl(rtmp, FloatConsts_EXP_BIT_MASK);
9357   movdl(dst, src);
9358   andl(dst, rtmp);
9359   sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
9360   movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
9361   subl(rtmp, dst);
9362   movl(rcx, rtmp);
9363   movl(dst, MINUS_32);
9364   testl(rtmp, dst);
9365   jccb(Assembler::notEqual, L_special_case);
9366   movdl(dst, src);
9367   andl(dst, FloatConsts_SIGNIF_BIT_MASK);
9368   orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
9369   movdl(rtmp, src);
9370   testl(rtmp, rtmp);
9371   jccb(Assembler::greaterEqual, L_block1);
9372   negl(dst);
9373   bind(L_block1);
9374   sarl(dst);
9375   addl(dst, 0x1);
9376   sarl(dst, 0x1);
9377   jmp(L_exit);
9378   bind(L_special_case);
9379   convert_f2i(dst, src);
9380   bind(L_exit);
9381 }
9382 
9383 void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
9384   // The following code is a line-by-line assembly translation of the rounding algorithm.
9385   // Please refer to the java.lang.Math.round(double) algorithm for details.
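  // The structure mirrors round_float above, using 64-bit constants and
  // shifts, with convert_d2l handling the out-of-range/NaN special case.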
9386   const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
9387   const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
9388   const int64_t DoubleConsts_EXP_BIAS = 1023;
9389   const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
9390   const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
9391   Label L_special_case, L_block1, L_exit;
9392   mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
9393   movq(dst, src);
9394   andq(dst, rtmp);
9395   sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
9396   mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
9397   subq(rtmp, dst);
9398   movq(rcx, rtmp);
9399   mov64(dst, MINUS_64);
9400   testq(rtmp, dst);
9401   jccb(Assembler::notEqual, L_special_case);
9402   movq(dst, src);
9403   mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
9404   andq(dst, rtmp);
9405   mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
9406   orq(dst, rtmp);
9407   movq(rtmp, src);
9408   testq(rtmp, rtmp);
9409   jccb(Assembler::greaterEqual, L_block1);
9410   negq(dst);
9411   bind(L_block1);
9412   sarq(dst);
9413   addq(dst, 0x1);
9414   sarq(dst, 0x1);
9415   jmp(L_exit);
9416   bind(L_special_case);
9417   convert_d2l(dst, src);
9418   bind(L_exit);
9419 }
9420 
9421 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
9422   Label done;
9423   cvttsd2siq(dst, src);
9424   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
9425   jccb(Assembler::notEqual, done);
9426   subptr(rsp, 8);
9427   movdbl(Address(rsp, 0), src);
9428   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
9429   pop(dst);
9430   bind(done);
9431 }
9432 
9433 void MacroAssembler::cache_wb(Address line)
9434 {
9435   // 64-bit CPUs always support clflush
9436   assert(VM_Version::supports_clflush(), "clflush should be available");
9437   bool optimized = VM_Version::supports_clflushopt();
9438   bool no_evict = VM_Version::supports_clwb();
9439 
9440   // Prefer clwb (writeback without evict); otherwise
9441   // prefer clflushopt (potentially parallel writeback with evict);
9442   // otherwise fall back on clflush (serial writeback with evict).
9443 
9444   if (optimized) {
9445     if (no_evict) {
9446       clwb(line);
9447     } else {
9448       clflushopt(line);
9449     }
9450   } else {
9451     // no need for fence when using CLFLUSH
9452     clflush(line);
9453   }
9454 }
9455 
9456 void MacroAssembler::cache_wbsync(bool is_pre)
9457 {
9458   assert(VM_Version::supports_clflush(), "clflush should be available");
9459   bool optimized = VM_Version::supports_clflushopt();
9460   bool no_evict = VM_Version::supports_clwb();
9461 
9462   // pick the correct implementation
9463 
9464   if (!is_pre && (optimized || no_evict)) {
9465     // need an sfence for post flush when using clflushopt or clwb
9466     // otherwise no need for any synchronization
9467 
9468     sfence();
9469   }
9470 }
9471 
9472 #endif // _LP64
9473 
9474 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9475   switch (cond) {
9476     // Note some conditions are synonyms for others
9477     case Assembler::zero:         return Assembler::notZero;
9478     case Assembler::notZero:      return Assembler::zero;
9479     case Assembler::less:         return Assembler::greaterEqual;
9480     case Assembler::lessEqual:    return Assembler::greater;
9481     case Assembler::greater:      return Assembler::lessEqual;
9482     case Assembler::greaterEqual: return Assembler::less;
9483     case Assembler::below:        return Assembler::aboveEqual;
9484     case Assembler::belowEqual:   return Assembler::above;
9485     case Assembler::above:        return Assembler::belowEqual;
9486     case Assembler::aboveEqual:   return Assembler::below;
9487     case Assembler::overflow:     return Assembler::noOverflow;
9488     case Assembler::noOverflow:   return Assembler::overflow;
9489     case Assembler::negative:     return Assembler::positive;
9490     case Assembler::positive:     return Assembler::negative;
9491     case Assembler::parity:       return Assembler::noParity;
9492     case Assembler::noParity:     return Assembler::parity;
9493   }
9494   ShouldNotReachHere(); return Assembler::overflow;
9495 }
9496 
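// SkipIfEqual skips the code emitted between its construction and destruction
// whenever the byte flag at 'flag_addr' equals 'value': the constructor emits
// the compare and conditional branch, and the destructor binds the branch
// target.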
9497 SkipIfEqual::SkipIfEqual(
9498     MacroAssembler* masm, const bool* flag_addr, bool value) {
9499   _masm = masm;
9500   _masm->cmp8(ExternalAddress((address)flag_addr), value);
9501   _masm->jcc(Assembler::equal, _label);
9502 }
9503 
9504 SkipIfEqual::~SkipIfEqual() {
9505   _masm->bind(_label);
9506 }
9507 
9508 // 32-bit Windows has its own fast-path implementation
9509 // of get_thread
9510 #if !defined(WIN32) || defined(_LP64)
9511 
9512 // This is simply a call to Thread::current()
9513 void MacroAssembler::get_thread(Register thread) {
9514   if (thread != rax) {
9515     push(rax);
9516   }
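  // Save the remaining caller-saved registers that the C call below may
  // clobber; the result of Thread::current() is returned in rax.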
9517   LP64_ONLY(push(rdi);)
9518   LP64_ONLY(push(rsi);)
9519   push(rdx);
9520   push(rcx);
9521 #ifdef _LP64
9522   push(r8);
9523   push(r9);
9524   push(r10);
9525   push(r11);
9526 #endif
9527 
9528   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
9529 
9530 #ifdef _LP64
9531   pop(r11);
9532   pop(r10);
9533   pop(r9);
9534   pop(r8);
9535 #endif
9536   pop(rcx);
9537   pop(rdx);
9538   LP64_ONLY(pop(rsi);)
9539   LP64_ONLY(pop(rdi);)
9540   if (thread != rax) {
9541     mov(thread, rax);
9542     pop(rax);
9543   }
9544 }
9545 
9546 
9547 #endif // !WIN32 || _LP64