1 /*
   2  * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/compiler_globals.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/collectedHeap.inline.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/bytecodeHistogram.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/resourceArea.hpp"
  38 #include "memory/universe.hpp"
  39 #include "oops/accessDecorators.hpp"
  40 #include "oops/compressedOops.inline.hpp"
  41 #include "oops/klass.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/biasedLocking.hpp"
  44 #include "runtime/flags/flagSetting.hpp"
  45 #include "runtime/interfaceSupport.inline.hpp"
  46 #include "runtime/jniHandles.hpp"
  47 #include "runtime/objectMonitor.hpp"
  48 #include "runtime/os.hpp"
  49 #include "runtime/safepoint.hpp"
  50 #include "runtime/safepointMechanism.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/stubRoutines.hpp"
  53 #include "runtime/thread.hpp"
  54 #include "utilities/macros.hpp"
  55 #include "crc32c.h"
  56 
  57 #ifdef COMPILER2
  58 #include "opto/c2_CodeStubs.hpp"
  59 #include "opto/compile.hpp"
  60 #include "opto/output.hpp"
  61 #endif
  62 
// Code-buffer annotation macros: in PRODUCT builds block comments are compiled
// out; otherwise they are recorded in the code buffer to aid disassembly.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// Binds a label and annotates the bind point with the label's name.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  72 
#ifdef ASSERT
// Platform hook: on x86, instruction-mark checking is enabled in debug builds.
bool AbstractAssembler::pd_check_instruction_mark() { return true; }
#endif
  76 
// Maps each x86 condition code to its logical negation, indexed by the
// Assembler::Condition encoding (0x0 - 0xf) noted in the trailing comments.
static Assembler::Condition reverse[] = {
    Assembler::noOverflow     /* overflow      = 0x0 */ ,
    Assembler::overflow       /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above          /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual     /* above         = 0x7 */ ,
    Assembler::positive       /* negative      = 0x8 */ ,
    Assembler::negative       /* positive      = 0x9 */ ,
    Assembler::noParity       /* parity        = 0xa */ ,
    Assembler::parity         /* noParity      = 0xb */ ,
    Assembler::greaterEqual   /* less          = 0xc */ ,
    Assembler::less           /* greaterEqual  = 0xd */ ,
    Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf, */

};
  96 
  97 
  98 // Implementation of MacroAssembler
  99 
 100 // First all the versions that have distinct versions depending on 32/64 bit
 101 // Unless the difference is trivial (1 line or so).
 102 
 103 #ifndef _LP64
 104 
 105 // 32bit versions
 106 
// Converts an AddressLiteral to a plain Address; on 32-bit the absolute
// target address can be embedded directly in the instruction.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

// Converts an ArrayAddress (base plus scaled index) to an Address.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}
 114 
// Calls a C runtime leaf routine.  Arguments were pushed onto the stack by
// the caller, so the caller-side cleanup (popping them) is emitted here.
void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);  // discard pushed arguments
}
 120 
// Compares against a Metadata* / jobject immediate.  On 32-bit the pointer is
// embedded as an int32 immediate with a relocation so it can be patched.
void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}


void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
 137 
// Sign-extends lo into hi.  When the pair is already rdx:rax on a P6-class
// CPU a single cdq does the job; otherwise copy and arithmetic-shift.
void MacroAssembler::extend_sign(Register hi, Register lo) {
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
    cdql();
  } else {
    movl(hi, lo);
    sarl(hi, 31);  // replicate the sign bit into all of hi
  }
}
 147 
// Branches on the x87 FPU condition flag C2: fnstsw_ax copies the FPU status
// word into ax, and sahf moves it into EFLAGS where C2 lands in the parity
// flag.  rax is preserved via save_rax/restore_rax (tmp is the spill reg).
void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}

// Same as jC2, but branches when the FPU C2 flag is clear.
void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}
 167 
// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry));
}
 173 
// Note: y_lo will be destroyed
// Compares the 64-bit pair x_hi:x_lo against y_hi:y_lo and leaves -1/0/+1
// in x_hi (the return register), matching Java's lcmp semantics.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
  Label high, low, done;

  cmpl(x_hi, y_hi);                 // signed compare of the high words
  jcc(Assembler::less, low);
  jcc(Assembler::greater, high);
  // x_hi is the return register
  xorl(x_hi, x_hi);                 // high words equal: tentatively 0
  cmpl(x_lo, y_lo);                 // unsigned compare of the low words
  jcc(Assembler::below, low);
  jcc(Assembler::equal, done);

  bind(high);                       // x > y: result is +1
  xorl(x_hi, x_hi);
  increment(x_hi);
  jmp(done);

  bind(low);                        // x < y: result is -1
  xorl(x_hi, x_hi);
  decrementl(x_hi);

  bind(done);
}
 199 
// Loads the literal's address (not its contents) into dst, with relocation.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal32(dst, (int32_t)src.target(), src.rspec());
}

// Stores the literal's address into a memory destination.
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
}
 209 
// Tears down the current frame: restore rsp from rbp, then pop the saved rbp.
void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}
 214 
// 64x64->64 multiply of two stack-resident Java longs; result in rdx:rax.
// Clobbers rax, rbx, rcx and rdx.
void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset  |
  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
  //          [ y_hi ]                  | (in bytes)
  //            ....                    |
  //          [ x_lo ]                 /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
  // 3rd step
  bind(quick);                                   // note: rbx, = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}
 254 
// Negates the 64-bit pair hi:lo in place using the two-word negation idiom:
// neg lo sets carry when lo != 0; adc hi,0 then neg hi computes -(hi + carry).
void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);
  adcl(hi, 0);
  negl(hi);
}
 260 
// Shifts the 64-bit pair hi:lo left by the count in rcx (Java semantics:
// count is taken mod 64).
void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}
 280 
 281 
// Shifts the 64-bit pair hi:lo right by the count in rcx; arithmetic when
// sign_extension is true (Java >>), logical otherwise (Java >>>).
void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}
 302 
// Loads/stores an oop or Metadata* immediate.  On 32-bit the pointer fits in
// an int32 immediate; the relocation lets GC / patching find and update it.
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}
 318 
// Loads either the literal's address (lval) or its contents into dst.
void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  // scratch register is not used,
  // it is defined to match parameters of 64-bit version of this method.
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}
 341 
 342 
// Restores the registers saved by push_callee_saved_registers, in the
// mirror-image (reverse) order.
void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}

// Saves rsi, rdi, rdx and rcx on the stack.
void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}
 356 
// Pushes an oop / Metadata* immediate with a relocation record.
void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::pushklass(Metadata* obj) {
  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
}

// Pushes either the literal's address (lval) or its contents.
void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}
 372 
// On 32-bit, C arguments are passed on the stack, so each pass_argN simply
// pushes its register argument.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}
 388 
 389 #ifndef PRODUCT
 390 extern "C" void findpc(intptr_t x);
 391 #endif
 392 
 393 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 394   // In order to get locks to work, we need to fake a in_VM state
 395   JavaThread* thread = JavaThread::current();
 396   JavaThreadState saved_state = thread->thread_state();
 397   thread->set_thread_state(_thread_in_vm);
 398   if (ShowMessageBoxOnError) {
 399     JavaThread* thread = JavaThread::current();
 400     JavaThreadState saved_state = thread->thread_state();
 401     thread->set_thread_state(_thread_in_vm);
 402     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 403       ttyLocker ttyl;
 404       BytecodeCounter::print();
 405     }
 406     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 407     // This is the value of eip which points to where verify_oop will return.
 408     if (os::message_box(msg, "Execution stopped, print registers?")) {
 409       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 410       BREAKPOINT;
 411     }
 412   }
 413   fatal("DEBUG MESSAGE: %s", msg);
 414 }
 415 
// Prints the register state captured by stop()/debug32, some stack words,
// and a disassembly window around eip.
void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
  ttyLocker ttyl;
  // NOTE(review): Debugging presumably relaxes checks while dumping state
  // from a broken context -- confirm against the flag's definition.
  FlagSetting fs(Debugging, true);
  tty->print_cr("eip = 0x%08x", eip);
#ifndef PRODUCT
  if ((WizardMode || Verbose) && PrintMiscellaneous) {
    tty->cr();
    findpc(eip);
    tty->cr();
  }
#endif
#define PRINT_REG(rax) \
  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
  PRINT_REG(rax);
  PRINT_REG(rbx);
  PRINT_REG(rcx);
  PRINT_REG(rdx);
  PRINT_REG(rdi);
  PRINT_REG(rsi);
  PRINT_REG(rbp);
  PRINT_REG(rsp);
#undef PRINT_REG
  // Print some words near top of stack.
  int* dump_sp = (int*) rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 16; row++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 8; col++) {
      tty->print(" 0x%08x", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)eip-64, (address)eip);
  tty->print_cr("--------");
  Disassembler::decode((address)eip, (address)eip+32);
}
 456 
// Emits code that aborts with a message: pushes the message, the current eip
// and all registers, then calls debug32 (which does not return normally).
void MacroAssembler::stop(const char* msg) {
  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();
}
 466 
// Emits code that prints a warning via the runtime's warning() and continues
// execution; the full CPU state is preserved around the call.
void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  ExternalAddress message((address) msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}
 478 
// Emits code that captures eip and all registers, prints them via
// print_state32, then restores everything and continues.
void MacroAssembler::print_state() {
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers

  push_CPU_state();
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
  pop_CPU_state();

  popa();
  addl(rsp, wordSize);                                // discard the pushed eip
}
 490 
 491 #else // _LP64
 492 
 493 // 64 bit versions
 494 
// Converts an AddressLiteral to a pc-relative Address; the target must be
// reachable with a 32-bit displacement from the current pc.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());

}

// Materializes the array base into rscratch1 (emits code!) and returns an
// Address indexing off it.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  lea(rscratch1, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch1, index._index, index._scale, index._disp);
  return array;
}
 513 
// Calls a C runtime leaf routine, keeping rsp 16-byte aligned across the
// call as the ABI requires.  Arguments are already in registers.
void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for its register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp,  frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  subq(rsp, 8);                  // misaligned: drop 8 bytes so the call is aligned
  {
    call(RuntimeAddress(entry_point));
  }
  addq(rsp, 8);
  jmp(E);

  bind(L);                       // already aligned: call directly
  {
    call(RuntimeAddress(entry_point));
  }

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif

}
 547 
// 64-bit compare of src1 against the value at src2; goes through rscratch1
// when src2 is not reachable with a 32-bit pc-relative displacement.
void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
  assert(!src2.is_lval(), "should use cmpptr");

  if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
}
 558 
// Emits Java ldiv/lrem, handling the min_long / -1 overflow case that would
// otherwise raise #DE from idivq.  Returns the offset of the idivq for
// implicit-exception bookkeeping.
int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271.  The function
  // returns the (pc) offset of the idivl instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor   (may not be eax/edx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case
  cmp64(rax, ExternalAddress((address) &min_long));
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdqq();                   // sign-extend rax into rdx:rax for idivq
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}
 595 
 596 void MacroAssembler::decrementq(Register reg, int value) {
 597   if (value == min_jint) { subq(reg, value); return; }
 598   if (value <  0) { incrementq(reg, -value); return; }
 599   if (value == 0) {                        ; return; }
 600   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 601   /* else */      { subq(reg, value)       ; return; }
 602 }
 603 
 604 void MacroAssembler::decrementq(Address dst, int value) {
 605   if (value == min_jint) { subq(dst, value); return; }
 606   if (value <  0) { incrementq(dst, -value); return; }
 607   if (value == 0) {                        ; return; }
 608   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 609   /* else */      { subq(dst, value)       ; return; }
 610 }
 611 
// Increments the 64-bit value at dst, going through rscratch1 when dst is
// not reachable with a 32-bit pc-relative displacement.
void MacroAssembler::incrementq(AddressLiteral dst) {
  if (reachable(dst)) {
    incrementq(as_Address(dst));
  } else {
    lea(rscratch1, dst);
    incrementq(Address(rscratch1, 0));
  }
}
 620 
 621 void MacroAssembler::incrementq(Register reg, int value) {
 622   if (value == min_jint) { addq(reg, value); return; }
 623   if (value <  0) { decrementq(reg, -value); return; }
 624   if (value == 0) {                        ; return; }
 625   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 626   /* else */      { addq(reg, value)       ; return; }
 627 }
 628 
 629 void MacroAssembler::incrementq(Address dst, int value) {
 630   if (value == min_jint) { addq(dst, value); return; }
 631   if (value <  0) { decrementq(dst, -value); return; }
 632   if (value == 0) {                        ; return; }
 633   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 634   /* else */      { addq(dst, value)       ; return; }
 635 }
 636 
// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  lea(rscratch1, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch1;   // patch the materialized base into the dispatch address
  jmp(dispatch);
}
 646 
// Two-register long compare is a 32-bit-only concept; must not be reached here.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}
 651 
// Loads the literal's 64-bit address (not its contents) into dst.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}

// Stores the literal's address into memory, staged through rscratch1.
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  movptr(dst, rscratch1);
}
 660 
// Frame teardown using the one-byte LEAVE instruction (mov rsp,rbp; pop rbp).
void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_int8((unsigned char)0xC9); // LEAVE
}
 665 
// Two-register long negate is a 32-bit-only concept; must not be reached here.
void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}
 670 
// Loads/stores an oop or Metadata* immediate with a relocation record so the
// GC / patching code can find it.  Address forms stage through rscratch1.
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}
 688 
// Loads either the literal's address (lval) or its contents into dst,
// using scratch when the target is out of 32-bit pc-relative range.
void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  if (src.is_lval()) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  } else {
    if (reachable(src)) {
      movq(dst, as_Address(src));
    } else {
      lea(scratch, src);
      movq(dst, Address(scratch, 0));
    }
  }
}
 701 
// Pointer-sized moves to/from an ArrayAddress (base materialization happens
// inside as_Address, which clobbers rscratch1).
void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}
 709 
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  if (is_simm32(src)) {
    movptr(dst, checked_cast<int32_t>(src));
  } else {
    // Immediate does not fit a 32-bit store; stage through rscratch1.
    mov64(rscratch1, src);
    movq(dst, rscratch1);
  }
}

// These are mostly for initializing NULL
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}

void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}
 728 
// Pushes an oop / Metadata* immediate, staged through rscratch1 since x86-64
// has no 64-bit push-immediate.
void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushklass(Metadata* obj) {
  mov_metadata(rscratch1, obj);
  push(rscratch1);
}

// Pushes either the literal's address (lval) or its contents.
void MacroAssembler::pushptr(AddressLiteral src) {
  lea(rscratch1, src);
  if (src.is_lval()) {
    push(rscratch1);
  } else {
    pushq(Address(rscratch1, 0));
  }
}
 747 
// Convenience overload: clears the last Java frame anchor of the current
// thread (always in r15 on x86-64).
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  reset_last_Java_frame(r15_thread, clear_fp);
}
 751 
// Records the last Java frame anchor (sp, optional fp, optional pc) in the
// current thread (r15_thread) so native code can walk the Java stack.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  vzeroupper();
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));
    movptr(java_pc, rscratch1);
  }

  // NOTE(review): sp is stored last -- presumably the anchor only counts as
  // set once sp is written; confirm the ordering requirement.
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
 777 
 778 static void pass_arg0(MacroAssembler* masm, Register arg) {
 779   if (c_rarg0 != arg ) {
 780     masm->mov(c_rarg0, arg);
 781   }
 782 }
 783 
 784 static void pass_arg1(MacroAssembler* masm, Register arg) {
 785   if (c_rarg1 != arg ) {
 786     masm->mov(c_rarg1, arg);
 787   }
 788 }
 789 
 790 static void pass_arg2(MacroAssembler* masm, Register arg) {
 791   if (c_rarg2 != arg ) {
 792     masm->mov(c_rarg2, arg);
 793   }
 794 }
 795 
 796 static void pass_arg3(MacroAssembler* masm, Register arg) {
 797   if (c_rarg3 != arg ) {
 798     masm->mov(c_rarg3, arg);
 799   }
 800 }
 801 
// Emits code that aborts with a message via debug64.  When
// ShowMessageBoxOnError is set, the register state is captured first so
// debug64 can display it.
void MacroAssembler::stop(const char* msg) {
  if (ShowMessageBoxOnError) {
    address rip = pc();
    pusha(); // get regs on stack
    lea(c_rarg1, InternalAddress(rip));
    movq(c_rarg2, rsp); // pass pointer to regs array
  }
  lea(c_rarg0, ExternalAddress((address) msg));
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}
 814 
// Emits code that prints a warning via the runtime's warning() and continues;
// a temporary rbp frame keeps the original rsp recoverable after alignment.
void MacroAssembler::warn(const char* msg) {
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
  call(rax);
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}
 827 
// Emits code that captures rip and all registers, prints them via
// print_state64, then restores everything and continues.
void MacroAssembler::print_state() {
  address rip = pc();
  pusha();            // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}
 845 
 846 #ifndef PRODUCT
 847 extern "C" void findpc(intptr_t x);
 848 #endif
 849 
// Runtime handler behind MacroAssembler::stop() on 64-bit: regs[] points at
// the pusha image captured in stop().  Never returns normally.
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}
 872 
// Prints the register state captured by stop()/debug64, some stack words,
// and a disassembly window around pc.  regs[] is the pusha image (r15 first);
// regs[16] is the slot just above it, i.e. the pre-pusha rsp.
// NOTE(review): "%016lx" assumes long is 64 bits (LP64); on LLP64 targets
// (Windows) this would be wrong -- confirm build targets or use INTPTR_FORMAT.
void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  ttyLocker ttyl;
  // NOTE(review): Debugging presumably relaxes checks while dumping state
  // from a broken context -- confirm against the flag's definition.
  FlagSetting fs(Debugging, true);
  tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
#ifndef PRODUCT
  tty->cr();
  findpc(pc);
  tty->cr();
#endif
#define PRINT_REG(rax, value) \
  { tty->print("%s = ", #rax); os::print_location(tty, value); }
  PRINT_REG(rax, regs[15]);
  PRINT_REG(rbx, regs[12]);
  PRINT_REG(rcx, regs[14]);
  PRINT_REG(rdx, regs[13]);
  PRINT_REG(rdi, regs[8]);
  PRINT_REG(rsi, regs[9]);
  PRINT_REG(rbp, regs[10]);
  // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
  PRINT_REG(rsp, (intptr_t)(&regs[16]));
  PRINT_REG(r8 , regs[7]);
  PRINT_REG(r9 , regs[6]);
  PRINT_REG(r10, regs[5]);
  PRINT_REG(r11, regs[4]);
  PRINT_REG(r12, regs[3]);
  PRINT_REG(r13, regs[2]);
  PRINT_REG(r14, regs[1]);
  PRINT_REG(r15, regs[0]);
#undef PRINT_REG
  // Print some words near the top of the stack.
  int64_t* rsp = &regs[16];
  int64_t* dump_sp = rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 25; row++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 4; col++) {
      tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)pc-64, (address)pc);
  tty->print_cr("--------");
  Disassembler::decode((address)pc, (address)pc+32);
}
 921 
 922 // The java_calling_convention describes stack locations as ideal slots on
 923 // a frame with no abi restrictions. Since we must observe abi restrictions
 924 // (like the placement of the register window) the slots must be biased by
 925 // the following value.
 926 static int reg2offset_in(VMReg r) {
 927   // Account for saved rbp and return address
 928   // This should really be in_preserve_stack_slots
 929   return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
 930 }
 931 
 932 static int reg2offset_out(VMReg r) {
 933   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
 934 }
 935 
 936 // A long move
// Move a long-typed argument from src to dst (register or stack slot).
// Incoming stack slots are rbp-relative; outgoing ones are rsp-relative.
// Clobbers rax in the stack-to-stack case.
void MacroAssembler::long_move(VMRegPair src, VMRegPair dst) {

  // The calling conventions assures us that each VMregpair is either
  // all really one physical register or adjacent stack slots.

  if (src.is_single_phys_reg() ) {
    if (dst.is_single_phys_reg()) {
      // reg to reg; skip when already in place
      if (dst.first() != src.first()) {
        mov(dst.first()->as_Register(), src.first()->as_Register());
      }
    } else {
      // reg to outgoing (rsp-relative) stack slot
      assert(dst.is_single_reg(), "not a stack pair");
      movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
    }
  } else if (dst.is_single_phys_reg()) {
    assert(src.is_single_reg(),  "not a stack pair");
    // NOTE(review): reads an rbp-relative slot using reg2offset_out, while
    // move32_64/move_ptr use reg2offset_in for rbp-relative sources —
    // confirm this asymmetry is intended for this caller's frame layout.
    movq(dst.first()->as_Register(), Address(rbp, reg2offset_out(src.first())));
  } else {
    // stack to stack: bounce through rax
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
    movq(rax, Address(rbp, reg2offset_in(src.first())));
    movq(Address(rsp, reg2offset_out(dst.first())), rax);
  }
}
 960 
 961 // A double move
// Move a double-typed argument from src to dst (XMM register or stack slot).
// Clobbers rax in the stack-to-stack case.
void MacroAssembler::double_move(VMRegPair src, VMRegPair dst) {

  // The calling conventions assures us that each VMregpair is either
  // all really one physical register or adjacent stack slots.

  if (src.is_single_phys_reg() ) {
    if (dst.is_single_phys_reg()) {
      // In theory these overlap but the ordering is such that this is likely a nop
      if ( src.first() != dst.first()) {
        movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
      }
    } else {
      // xmm reg to outgoing (rsp-relative) stack slot
      assert(dst.is_single_reg(), "not a stack pair");
      movdbl(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
    }
  } else if (dst.is_single_phys_reg()) {
    assert(src.is_single_reg(),  "not a stack pair");
    // NOTE(review): rbp-relative read with reg2offset_out (cf. the
    // reg2offset_in used for the stack-to-stack source below) — confirm
    // this asymmetry is intended, same as in long_move.
    movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_out(src.first())));
  } else {
    // stack to stack: bounce the 64-bit payload through rax
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
    movq(rax, Address(rbp, reg2offset_in(src.first())));
    movq(Address(rsp, reg2offset_out(dst.first())), rax);
  }
}
 986 
 987 
 988 // A float arg may have to do float reg int reg conversion
// A float arg may have to do float reg int reg conversion
// Moves a single-precision argument; clobbers rax in the stack-to-stack case.
void MacroAssembler::float_move(VMRegPair src, VMRegPair dst) {
  assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");

  // The calling conventions assures us that each VMregpair is either
  // all really one physical register or adjacent stack slots.

  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack: 32-bit load zero-extends into rax, then a full
      // pointer-sized store fills the outgoing slot
      movl(rax, Address(rbp, reg2offset_in(src.first())));
      movptr(Address(rsp, reg2offset_out(dst.first())), rax);
    } else {
      // stack to reg
      assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
      movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first())));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
    movflt(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
  } else {
    // reg to reg
    // In theory these overlap but the ordering is such that this is likely a nop
    // NOTE(review): uses movdbl (64-bit) for a float value — only the low
    // 32 bits are meaningful; presumably intentional, confirm vs movflt.
    if ( src.first() != dst.first()) {
      movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
    }
  }
}
1016 
1017 // On 64 bit we will store integer like items to the stack as
1018 // 64 bits items (x86_32/64 abi) even though java would only store
1019 // 32bits for a parameter. On 32bit it will simply be 32 bits
1020 // So this routine will do 32->32 on 32bit and 32->64 on 64bit
1021 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst) {
1022   if (src.first()->is_stack()) {
1023     if (dst.first()->is_stack()) {
1024       // stack to stack
1025       movslq(rax, Address(rbp, reg2offset_in(src.first())));
1026       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1027     } else {
1028       // stack to reg
1029       movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1030     }
1031   } else if (dst.first()->is_stack()) {
1032     // reg to stack
1033     // Do we really have to sign extend???
1034     // __ movslq(src.first()->as_Register(), src.first()->as_Register());
1035     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1036   } else {
1037     // Do we really have to sign extend???
1038     // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
1039     if (dst.first() != src.first()) {
1040       movq(dst.first()->as_Register(), src.first()->as_Register());
1041     }
1042   }
1043 }
1044 
1045 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
1046   if (src.first()->is_stack()) {
1047     if (dst.first()->is_stack()) {
1048       // stack to stack
1049       movq(rax, Address(rbp, reg2offset_in(src.first())));
1050       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1051     } else {
1052       // stack to reg
1053       movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1054     }
1055   } else if (dst.first()->is_stack()) {
1056     // reg to stack
1057     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1058   } else {
1059     if (dst.first() != src.first()) {
1060       movq(dst.first()->as_Register(), src.first()->as_Register());
1061     }
1062   }
1063 }
1064 
1065 // An oop arg. Must pass a handle not the oop itself
// An oop arg. Must pass a handle not the oop itself
//
// map:                 OopMap for this safepoint; the oop's location is
//                      recorded in it so the GC can find/update the value.
// oop_handle_offset:   first stack slot of the reserved handle area.
// framesize_in_slots:  size of the current frame in stack slots, used to
//                      address oops still in the caller's frame.
// src/dst:             where the oop currently is / where the handle goes.
// is_receiver:         when true, the handle's frame offset is returned
//                      through receiver_offset.
// Clobbers rax when dst is a stack slot (used as rHandle).
void MacroAssembler::object_move(OopMap* map,
                        int oop_handle_offset,
                        int framesize_in_slots,
                        VMRegPair src,
                        VMRegPair dst,
                        bool is_receiver,
                        int* receiver_offset) {

  // must pass a handle. First figure out the location we use as a handle

  Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();

  // See if oop is NULL if it is we need no handle

  if (src.first()->is_stack()) {

    // Oop is already on the stack as an argument
    int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
    map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
    if (is_receiver) {
      *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
    }

    // The handle is the address of the caller's slot, unless the oop is
    // NULL, in which case the handle must be NULL as well.
    cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
    lea(rHandle, Address(rbp, reg2offset_in(src.first())));
    // conditionally move a NULL
    cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
  } else {

    // Oop is in an a register we must store it to the space we reserve
    // on the stack for oop_handles and pass a handle if oop is non-NULL

    const Register rOop = src.first()->as_Register();
    int oop_slot;
    // Each java argument register gets its own slot in the handle area.
    if (rOop == j_rarg0)
      oop_slot = 0;
    else if (rOop == j_rarg1)
      oop_slot = 1;
    else if (rOop == j_rarg2)
      oop_slot = 2;
    else if (rOop == j_rarg3)
      oop_slot = 3;
    else if (rOop == j_rarg4)
      oop_slot = 4;
    else {
      assert(rOop == j_rarg5, "wrong register");
      oop_slot = 5;
    }

    oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
    int offset = oop_slot*VMRegImpl::stack_slot_size;

    map->set_oop(VMRegImpl::stack2reg(oop_slot));
    // Store oop in handle area, may be NULL
    movptr(Address(rsp, offset), rOop);
    if (is_receiver) {
      *receiver_offset = offset;
    }

    cmpptr(rOop, (int32_t)NULL_WORD);
    lea(rHandle, Address(rsp, offset));
    // conditionally move a NULL from the handle area where it was just stored
    cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
  }

  // If arg is on the stack then place it otherwise it is already in correct reg.
  if (dst.first()->is_stack()) {
    movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
  }
}
1136 
1137 #endif // _LP64
1138 
1139 // Now versions that are common to 32/64 bit
1140 
1141 void MacroAssembler::addptr(Register dst, int32_t imm32) {
1142   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
1143 }
1144 
1145 void MacroAssembler::addptr(Register dst, Register src) {
1146   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1147 }
1148 
1149 void MacroAssembler::addptr(Address dst, Register src) {
1150   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1151 }
1152 
1153 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
1154   if (reachable(src)) {
1155     Assembler::addsd(dst, as_Address(src));
1156   } else {
1157     lea(rscratch1, src);
1158     Assembler::addsd(dst, Address(rscratch1, 0));
1159   }
1160 }
1161 
1162 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
1163   if (reachable(src)) {
1164     addss(dst, as_Address(src));
1165   } else {
1166     lea(rscratch1, src);
1167     addss(dst, Address(rscratch1, 0));
1168   }
1169 }
1170 
1171 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
1172   if (reachable(src)) {
1173     Assembler::addpd(dst, as_Address(src));
1174   } else {
1175     lea(rscratch1, src);
1176     Assembler::addpd(dst, Address(rscratch1, 0));
1177   }
1178 }
1179 
1180 // See 8273459.  Function for ensuring 64-byte alignment, intended for stubs only.
1181 // Stub code is generated once and never copied.
1182 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
1183 void MacroAssembler::align64() {
1184   align(64, (unsigned long long) pc());
1185 }
1186 
1187 void MacroAssembler::align32() {
1188   align(32, (unsigned long long) pc());
1189 }
1190 
1191 void MacroAssembler::align(int modulus) {
1192   // 8273459: Ensure alignment is possible with current segment alignment
1193   assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
1194   align(modulus, offset());
1195 }
1196 
1197 void MacroAssembler::align(int modulus, int target) {
1198   if (target % modulus != 0) {
1199     nop(modulus - (target % modulus));
1200   }
1201 }
1202 
1203 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1204   // Used in sign-masking with aligned address.
1205   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1206   if (reachable(src)) {
1207     Assembler::andpd(dst, as_Address(src));
1208   } else {
1209     lea(scratch_reg, src);
1210     Assembler::andpd(dst, Address(scratch_reg, 0));
1211   }
1212 }
1213 
1214 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1215   // Used in sign-masking with aligned address.
1216   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1217   if (reachable(src)) {
1218     Assembler::andps(dst, as_Address(src));
1219   } else {
1220     lea(scratch_reg, src);
1221     Assembler::andps(dst, Address(scratch_reg, 0));
1222   }
1223 }
1224 
1225 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1226   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1227 }
1228 
// Atomically increment the 32-bit counter at counter_addr (lock-prefixed).
void MacroAssembler::atomic_incl(Address counter_addr) {
  lock();
  incrementl(counter_addr);
}
1233 
1234 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1235   if (reachable(counter_addr)) {
1236     atomic_incl(as_Address(counter_addr));
1237   } else {
1238     lea(scr, counter_addr);
1239     atomic_incl(Address(scr, 0));
1240   }
1241 }
1242 
1243 #ifdef _LP64
// Atomically increment the 64-bit counter at counter_addr (lock-prefixed).
void MacroAssembler::atomic_incq(Address counter_addr) {
  lock();
  incrementq(counter_addr);
}
1248 
1249 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1250   if (reachable(counter_addr)) {
1251     atomic_incq(as_Address(counter_addr));
1252   } else {
1253     lea(scr, counter_addr);
1254     atomic_incq(Address(scr, 0));
1255   }
1256 }
1257 #endif
1258 
1259 // Writes to stack successive pages until offset reached to check for
1260 // stack overflow + shadow pages.  This clobbers tmp.
// size: total byte size to probe (consumed — decremented to <= 0).
// tmp:  scratch; walks downward from the current rsp, one page at a time.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  // Write 'size' to (tmp - pagesize) as the probe value, then step down.
  movl(Address(tmp, (-os::vm_page_size())), size );
  subptr(tmp, os::vm_page_size());
  subl(size, os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down including all pages in the shadow zone.
  for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / os::vm_page_size()); i++) {
    // this could be any sized move but this is can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*os::vm_page_size())), size );
  }
}
1285 
// Check whether rsp has grown into the reserved stack zone; if so, call
// into the VM to enable the zone and jump to the stub that throws the
// delayed StackOverflowError. Otherwise falls straight through.
void MacroAssembler::reserved_stack_check() {
    // testing if reserved zone needs to be enabled
    Label no_reserved_zone_enabling;
    Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
    NOT_LP64(get_thread(rsi);)

    cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
    jcc(Assembler::below, no_reserved_zone_enabling);

    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
    jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
    should_not_reach_here();

    bind(no_reserved_zone_enabling);
}
1301 
// Attempt to acquire a bias on obj_reg's mark word for the current thread.
// Control flow: jumps to 'done' when the lock is biased to us (fast path);
// jumps to *slow_case (if non-NULL) when a CAS loses and the runtime must
// revoke/rebias; otherwise falls through to 'cas_label' at the end so the
// caller can continue with the ordinary CAS-based locking.
// swap_reg must be rax (used implicitly by cmpxchg); tmp_reg and tmp_reg2
// are scratch (tmp_reg2 is passed to load_prototype_header). On x86_32 the
// word at (lock_reg, 0) is used to spill the mark word.
void MacroAssembler::biased_locking_enter(Register lock_reg,
                                          Register obj_reg,
                                          Register swap_reg,
                                          Register tmp_reg,
                                          Register tmp_reg2,
                                          bool swap_reg_contains_mark,
                                          Label& done,
                                          Label* slow_case,
                                          BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
  assert(tmp_reg != noreg, "tmp_reg must be supplied");
  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  NOT_LP64( Address saved_mark_addr(lock_reg, 0); )

  if (PrintBiasedLockingStatistics && counters == NULL) {
    counters = BiasedLocking::counters();
  }
  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  if (!swap_reg_contains_mark) {
    movptr(swap_reg, mark_addr);
  }
  movptr(tmp_reg, swap_reg);
  andptr(tmp_reg, markWord::biased_lock_mask_in_place);
  cmpptr(tmp_reg, markWord::biased_lock_pattern);
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
#ifndef _LP64
  // Note that because there is no current thread register on x86_32 we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
  movptr(saved_mark_addr, swap_reg);
#endif
  load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
  xorptr(tmp_reg, swap_reg);
  Register header_reg = tmp_reg;
#else
  xorptr(tmp_reg, swap_reg);
  get_thread(swap_reg);
  xorptr(swap_reg, tmp_reg);
  Register header_reg = swap_reg;
#endif
  // header_reg now holds (prototype | thread) XOR mark: zero (ignoring
  // age bits) means the object is already biased to us.
  andptr(header_reg, ~((int) markWord::age_mask_in_place));
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testptr(header_reg, markWord::biased_lock_mask_in_place);
  jcc(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testptr(header_reg, markWord::epoch_mask_in_place);
  jccb(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  andptr(swap_reg,
         markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
#ifdef _LP64
  movptr(tmp_reg, swap_reg);
  orptr(tmp_reg, r15_thread);
#else
  get_thread(tmp_reg);
  orptr(tmp_reg, swap_reg);
#endif
  lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
#else
  get_thread(swap_reg);
  orptr(tmp_reg, swap_reg);
  movptr(swap_reg, saved_mark_addr);
#endif
  lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
  lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);
}
1480 
// Fast-path check for unlocking a biased object: if obj's mark still has
// the biased pattern, unlocking is a no-op and we jump to 'done'.
// Clobbers temp_reg.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andptr(temp_reg, markWord::biased_lock_mask_in_place);
  cmpptr(temp_reg, markWord::biased_lock_pattern);
  jcc(Assembler::equal, done);
}
1495 
// Normalize a C-style boolean in x to exactly 0 or 1.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  andl(x, 0xFF);
  setb(Assembler::notZero, x);
}
1504 
1505 // Wouldn't need if AddressLiteral version had new name
// Direct call to a label, forwarded to the base assembler.
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
  Assembler::call(L, rtype);
}
1509 
// Indirect call through the register 'entry'.
void MacroAssembler::call(Register entry) {
  Assembler::call(entry);
}
1513 
1514 void MacroAssembler::call(AddressLiteral entry) {
1515   if (reachable(entry)) {
1516     Assembler::call_literal(entry.target(), entry.rspec());
1517   } else {
1518     lea(rscratch1, entry);
1519     Assembler::call(rscratch1);
1520   }
1521 }
1522 
// Emit an inline-cache call: loads the universal non-oop sentinel into rax
// (the IC cache slot) and calls entry under a virtual_call relocation
// carrying method_index.
void MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  movptr(rax, (intptr_t)Universe::non_oop_word());
  call(AddressLiteral(entry, rh));
}
1528 
1529 // Implementation of call_VM versions
1530 
// call_VM with no arguments. Emits a small out-of-line "trampoline": the
// call into C pushes a return address (presumably consumed by
// call_VM_helper/call_VM_base as the last Java pc — confirm there), then
// execution resumes at E past the helper code.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);  // enter the out-of-line helper below
  jmp(E);                    // skip over it in the fall-through path

  bind(C);
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
  ret(0);

  bind(E);
}
1544 
// call_VM with one Register argument; same call/jmp trampoline shape as
// the zero-argument overload.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
  ret(0);

  bind(E);
}
1560 
// call_VM with two Register arguments. Arguments are staged into the C
// argument registers in reverse order so a later move cannot clobber an
// earlier source (hence the asserts).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));

  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
  ret(0);

  bind(E);
}
1581 
// call_VM with three Register arguments; args staged in reverse order to
// avoid clobbering (see the two-argument overload).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
  ret(0);

  bind(E);
}
1607 
1608 void MacroAssembler::call_VM(Register oop_result,
1609                              Register last_java_sp,
1610                              address entry_point,
1611                              int number_of_arguments,
1612                              bool check_exceptions) {
1613   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1614   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1615 }
1616 
// call_VM with explicit last_java_sp and one Register argument.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
1625 
// call_VM with explicit last_java_sp and two Register arguments; args are
// staged in reverse order so a later move cannot clobber an earlier source.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
1638 
// call_VM with explicit last_java_sp and three Register arguments; args
// staged in reverse order to avoid clobbering.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
1654 
// Like call_VM but invokes MacroAssembler::call_VM_base with an explicit
// class qualifier, bypassing any override of call_VM_base in a derived
// assembler (hence "super").
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   int number_of_arguments,
                                   bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
1663 
// One-argument super_call_VM: place arg_1, then delegate to the
// count-based overload above.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   bool check_exceptions) {
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
1672 
// Two-argument super_call_VM.  Same last-to-first argument passing and
// clobber asserts as the call_VM counterpart.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
1685 
// Three-argument super_call_VM.  Arguments passed in reverse; asserts
// guard against a still-pending argument living in the register the
// next pass_arg_N writes (LP64).
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   Register arg_3,
                                   bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
1701 
// The workhorse behind all call_VM variants: transitions from generated
// Java code into the VM runtime at entry_point and back.
//
//   oop_result          - if valid, receives the oop stashed by the VM in
//                         JavaThread::vm_result (cleared afterwards)
//   java_thread         - thread register; noreg means "compute it here"
//   last_java_sp        - SP to record in the Java frame anchor; noreg
//                         means use rsp
//   number_of_arguments - C arguments already set up by the caller
//                         (excluding the thread, which is added here)
//   check_exceptions    - if true, forward any pending exception
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
#ifdef _LP64
    java_thread = r15_thread;
#else
    java_thread = rdi;
    get_thread(java_thread);
#endif // LP64
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // r12 is the heapbase.
  LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  NOT_LP64(push(java_thread); number_of_arguments++);
  LP64_ONLY(mov(c_rarg0, r15_thread));

  // set last Java frame before call
  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

  // restore the thread (cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however can use the register value directly if it is callee saved.
  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
    // rdi & rsi (also r15) are callee saved -> nothing to do
#ifdef ASSERT
    // Cross-check: the callee-saved register really still holds the
    // current thread after the call.
    guarantee(java_thread != rax, "change this code");
    push(rax);
    { Label L;
      get_thread(rax);
      cmpptr(java_thread, rax);
      jcc(Assembler::equal, L);
      STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
      bind(L);
    }
    pop(rax);
#endif
  } else {
    get_thread(java_thread);
  }
  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(java_thread, true);

   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
#ifndef _LP64
    jump_cc(Assembler::notEqual,
            RuntimeAddress(StubRoutines::forward_exception_entry()));
#else
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach
    // (i.e. emit an unconditional far jump guarded by the inverse condition).

    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
#endif // LP64
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}
1798 
// Shared helper for the implicit-last_java_sp call_VM variants: computes
// the value last_Java_sp should have (in rax) and then calls
// call_VM_base with it.
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {

  // Calculate the value for last_Java_sp
  // somewhat subtle. call_VM does an intermediate call
  // which places a return address on the stack just under the
  // stack pointer as the user finished with it. This allows
  // us to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64bit call_VM only can use register args
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.

#ifdef _LP64
  // We've pushed one address, correct last_Java_sp
  lea(rax, Address(rsp, wordSize));
#else
  // Skip the return address plus the arguments pushed for the call.
  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
#endif // LP64

  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);

}
1821 
1822 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
// Zero-argument leaf call that always uses MacroAssembler's own
// call_VM_leaf_base (see comment above about Interpreter usage).
void MacroAssembler::call_VM_leaf0(address entry_point) {
  MacroAssembler::call_VM_leaf_base(entry_point, 0);
}
1826 
// Leaf call (no Java frame anchor, no exception check) with
// number_of_arguments already in place.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}
1830 
// One-argument leaf call: place arg_0, then delegate.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 1);
}
1835 
// Two-argument leaf call; arguments passed last-to-first with a clobber
// assert (pass_arg1 writes c_rarg1 on LP64).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 2);
}
1843 
// Three-argument leaf call; reverse-order passing with clobber asserts.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 3);
}
1853 
// One-argument leaf call routed explicitly through MacroAssembler's
// call_VM_leaf_base, bypassing derived-class overrides ("super").
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
1858 
// Two-argument super leaf call; reverse-order passing, clobber assert.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
1866 
// Three-argument super leaf call; reverse-order passing, clobber asserts.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}
1876 
// Four-argument super leaf call; arguments passed arg_3 down to arg_0 so
// earlier passes never clobber pending ones (asserted on LP64).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
1890 
// Fetch the oop the VM left in JavaThread::vm_result, clear the slot so
// it is not treated as a live root afterwards, and verify the oop.
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
  verify_oop_msg(oop_result, "broken oop in call_VM_base");
}
1896 
// Fetch the secondary (metadata) result from JavaThread::vm_result_2 and
// clear the slot.  No oop verification: the value is Metadata, not an oop.
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
}
1901 
// Intentionally empty hook, called after every call_VM_base return;
// presumably overridden where early-return handling is needed (e.g. the
// interpreter's assembler) — confirm against subclasses.
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
}
1904 
// Intentionally empty hook, called after every call_VM_base return;
// presumably overridden where pop-frame handling is needed (e.g. the
// interpreter's assembler) — confirm against subclasses.
void MacroAssembler::check_and_handle_popframe(Register java_thread) {
}
1907 
1908 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
1909   if (reachable(src1)) {
1910     cmpl(as_Address(src1), imm);
1911   } else {
1912     lea(rscratch1, src1);
1913     cmpl(Address(rscratch1, 0), imm);
1914   }
1915 }
1916 
1917 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
1918   assert(!src2.is_lval(), "use cmpptr");
1919   if (reachable(src2)) {
1920     cmpl(src1, as_Address(src2));
1921   } else {
1922     lea(rscratch1, src2);
1923     cmpl(src1, Address(rscratch1, 0));
1924   }
1925 }
1926 
// 32-bit register/immediate compare; thin forwarder to the assembler.
void MacroAssembler::cmp32(Register src1, int32_t imm) {
  Assembler::cmpl(src1, imm);
}
1930 
// 32-bit register/memory compare; thin forwarder to the assembler.
void MacroAssembler::cmp32(Register src1, Address src2) {
  Assembler::cmpl(src1, src2);
}
1934 
// Materialize a double compare as an int in dst: -1 if opr1 < opr2,
// 0 if equal, 1 if greater.  ucomisd sets PF on an unordered (NaN)
// operand; the parity branch maps that case to -1 or +1 depending on
// unordered_is_less.
void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomisd(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);   // unordered -> -1
    jcc(Assembler::below , L);   // less      -> -1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal     ->  0
    increment(dst);              // greater   ->  1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);   // unordered ->  1
    jcc(Assembler::above , L);   // greater   ->  1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal     ->  0
    decrementl(dst);             // less      -> -1
  }
  bind(L);
}
1956 
// Single-precision counterpart of cmpsd2int: dst = -1/0/1 for
// less/equal/greater, with NaN (unordered, PF set by ucomiss) mapped
// according to unordered_is_less.
void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomiss(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);   // unordered -> -1
    jcc(Assembler::below , L);   // less      -> -1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal     ->  0
    increment(dst);              // greater   ->  1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);   // unordered ->  1
    jcc(Assembler::above , L);   // greater   ->  1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal     ->  0
    decrementl(dst);             // less      -> -1
  }
  bind(L);
}
1978 
1979 
1980 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
1981   if (reachable(src1)) {
1982     cmpb(as_Address(src1), imm);
1983   } else {
1984     lea(rscratch1, src1);
1985     cmpb(Address(rscratch1, 0), imm);
1986   }
1987 }
1988 
// Pointer-width compare of a register against an AddressLiteral.
// An lval literal means "compare against the literal's address itself",
// not the memory it names.
void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
#ifdef _LP64
  if (src2.is_lval()) {
    // Compare against the literal address (materialized in rscratch1).
    movptr(rscratch1, src2);
    Assembler::cmpq(src1, rscratch1);
  } else if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    // Out of RIP-relative reach: go through rscratch1.
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
#else
  if (src2.is_lval()) {
    // 32-bit: embed the literal address with its relocation.
    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
  } else {
    cmpl(src1, as_Address(src2));
  }
#endif // _LP64
}
2008 
// Compare memory at src1 against the *address* of literal src2.  Only
// lval literals are legal: x86 has no mem-mem compare, so the literal
// side must be an immediate/register value.
void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
  assert(src2.is_lval(), "not a mem-mem compare");
#ifdef _LP64
  // moves src2's literal address
  movptr(rscratch1, src2);
  Assembler::cmpq(src1, rscratch1);
#else
  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
#endif // _LP64
}
2019 
// Compare two oops held in registers (plain pointer-width compare).
void MacroAssembler::cmpoop(Register src1, Register src2) {
  cmpptr(src1, src2);
}
2023 
// Compare an oop in a register against an oop in memory.
void MacroAssembler::cmpoop(Register src1, Address src2) {
  cmpptr(src1, src2);
}
2027 
#ifdef _LP64
// Compare an oop in a register against an embedded oop constant; the
// constant is materialized (with relocation) in rscratch1 first.
void MacroAssembler::cmpoop(Register src1, jobject src2) {
  movoop(rscratch1, src2);
  cmpptr(src1, rscratch1);
}
#endif
2034 
2035 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2036   if (reachable(adr)) {
2037     lock();
2038     cmpxchgptr(reg, as_Address(adr));
2039   } else {
2040     lea(rscratch1, adr);
2041     lock();
2042     cmpxchgptr(reg, Address(rscratch1, 0));
2043   }
2044 }
2045 
// Pointer-width compare-and-exchange: cmpxchgq on 64-bit, cmpxchgl on
// 32-bit.
void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
}
2049 
2050 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2051   if (reachable(src)) {
2052     Assembler::comisd(dst, as_Address(src));
2053   } else {
2054     lea(rscratch1, src);
2055     Assembler::comisd(dst, Address(rscratch1, 0));
2056   }
2057 }
2058 
2059 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2060   if (reachable(src)) {
2061     Assembler::comiss(dst, as_Address(src));
2062   } else {
2063     lea(rscratch1, src);
2064     Assembler::comiss(dst, Address(rscratch1, 0));
2065   }
2066 }
2067 
2068 
// Increment a 32-bit counter only when 'cond' holds, without disturbing
// the caller's condition codes: branch around on the negated condition,
// and bracket the increment with pushf/popf since atomic_incl clobbers
// flags.
void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
  Condition negated_cond = negate_condition(cond);
  Label L;
  jcc(negated_cond, L);
  pushf(); // Preserve flags
  atomic_incl(counter_addr);
  popf();
  bind(L);
}
2078 
int MacroAssembler::corrected_idivl(Register reg) {
  // Full implementation of Java idiv and irem; checks for
  // special case as described in JVM spec., p.243 & p.271
  // (min_int / -1 must yield min_int, not a #DE trap).
  // The function returns the (pc) offset of the idivl
  // instruction - may be needed for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax,: dividend                         min_int
  //         reg: divisor   (may not be rax,/rdx)   -1
  //
  // output: rax,: quotient  (= rax, idiv reg)       min_int
  //         rdx: remainder (= rax, irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
  const int min_int = 0x80000000;
  Label normal_case, special_case;

  // check for special case
  cmpl(rax, min_int);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
  cmpl(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdql();                        // sign-extend rax into rdx:rax for idiv
  int idivl_offset = offset();
  idivl(reg);

  // normal and special case exit
  bind(special_case);

  return idivl_offset;
}
2114 
2115 
2116 
2117 void MacroAssembler::decrementl(Register reg, int value) {
2118   if (value == min_jint) {subl(reg, value) ; return; }
2119   if (value <  0) { incrementl(reg, -value); return; }
2120   if (value == 0) {                        ; return; }
2121   if (value == 1 && UseIncDec) { decl(reg) ; return; }
2122   /* else */      { subl(reg, value)       ; return; }
2123 }
2124 
2125 void MacroAssembler::decrementl(Address dst, int value) {
2126   if (value == min_jint) {subl(dst, value) ; return; }
2127   if (value <  0) { incrementl(dst, -value); return; }
2128   if (value == 0) {                        ; return; }
2129   if (value == 1 && UseIncDec) { decl(dst) ; return; }
2130   /* else */      { subl(dst, value)       ; return; }
2131 }
2132 
// Signed division by 2^shift_value via arithmetic shift.  A plain sar
// rounds toward negative infinity; adding (2^shift - 1) to negative
// dividends first corrects the result to round toward zero (Java
// semantics).
void MacroAssembler::division_with_shift (Register reg, int shift_value) {
  assert (shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl (reg, reg);
  jcc (Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1 ;

  if (offset == 1) {
    incrementl(reg);
  } else {
    addl(reg, offset);
  }

  bind (_is_positive);
  sarl(reg, shift_value);
}
2149 
2150 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2151   if (reachable(src)) {
2152     Assembler::divsd(dst, as_Address(src));
2153   } else {
2154     lea(rscratch1, src);
2155     Assembler::divsd(dst, Address(rscratch1, 0));
2156   }
2157 }
2158 
2159 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2160   if (reachable(src)) {
2161     Assembler::divss(dst, as_Address(src));
2162   } else {
2163     lea(rscratch1, src);
2164     Assembler::divss(dst, Address(rscratch1, 0));
2165   }
2166 }
2167 
// Standard frame prologue: save caller's frame pointer, establish ours.
void MacroAssembler::enter() {
  push(rbp);
  mov(rbp, rsp);
}
2172 
// A 5 byte nop that is safe for patching (see patch_verified_entry)
void MacroAssembler::fat_nop() {
  if (UseAddressNop) {
    addr_nop_5();
  } else {
    // Four segment-override prefixes followed by a one-byte nop: five
    // bytes total, executed as a single harmless instruction.
    emit_int8(0x26); // es:
    emit_int8(0x2e); // cs:
    emit_int8(0x64); // fs:
    emit_int8(0x65); // gs:
    emit_int8((unsigned char)0x90);
  }
}
2185 
2186 #ifndef _LP64
// 32-bit only: compare ST0 with ST1, popping both (the common case).
void MacroAssembler::fcmp(Register tmp) {
  fcmp(tmp, 1, true, true);
}
2190 
// 32-bit only: compare ST0 with ST(index) and leave the result in
// eflags.  On P6+ (cmov support) fucomi/fucomip set eflags directly;
// otherwise the FPU status word is transferred via rax, (fnstsw_ax +
// sahf), which requires a temp register to preserve rax,.
// pop_left pops ST0, pop_right additionally pops the other operand.
void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
  assert(!pop_right || pop_left, "usage error");
  if (VM_Version::supports_cmov()) {
    assert(tmp == noreg, "unneeded temp");
    if (pop_left) {
      fucomip(index);
    } else {
      fucomi(index);
    }
    if (pop_right) {
      fpop();
    }
  } else {
    assert(tmp != noreg, "need temp");
    if (pop_left) {
      if (pop_right) {
        fcompp();
      } else {
        fcomp(index);
      }
    } else {
      fcom(index);
    }
    // convert FPU condition into eflags condition via rax,
    save_rax(tmp);
    fwait(); fnstsw_ax();
    sahf();
    restore_rax(tmp);
  }
  // condition codes set as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
}
2226 
// 32-bit only: common case of fcmp2int — compare ST0 with ST1, pop both.
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
  fcmp2int(dst, unordered_is_less, 1, true, true);
}
2230 
// 32-bit only: x87 analogue of cmpsd2int — compare ST0 with ST(index)
// and materialize -1/0/1 in dst, mapping the unordered (NaN) case per
// unordered_is_less.  Flags come from fcmp above (CF=less, PF=unordered,
// ZF=equal).
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);   // unordered -> -1
    jcc(Assembler::below , L);   // less      -> -1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal     ->  0
    increment(dst);              // greater   ->  1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);   // unordered ->  1
    jcc(Assembler::above , L);   // greater   ->  1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal     ->  0
    decrementl(dst);             // less      -> -1
  }
  bind(L);
}
2251 
// 32-bit only: push a double literal onto the x87 stack.
void MacroAssembler::fld_d(AddressLiteral src) {
  fld_d(as_Address(src));
}
2255 
// 32-bit only: push a float literal onto the x87 stack.
void MacroAssembler::fld_s(AddressLiteral src) {
  fld_s(as_Address(src));
}
2259 
// 32-bit only: load the x87 control word from a memory literal.
void MacroAssembler::fldcw(AddressLiteral src) {
  Assembler::fldcw(as_Address(src));
}
2263 
// 32-bit only: pop ST0 — mark it free, then advance the stack top.
void MacroAssembler::fpop() {
  ffree();
  fincstp();
}
2268 
// 32-bit only: IEEE remainder of ST0 by ST1.  fprem reduces partially
// and signals "incomplete" via C2 (surfaced as PF after fnstsw/sahf),
// so loop until the reduction finishes.  tmp is used to preserve rax,
// across the status-word transfer.
void MacroAssembler::fremr(Register tmp) {
  save_rax(tmp);
  { Label L;
    bind(L);
    fprem();
    fwait(); fnstsw_ax();
    sahf();
    jcc(Assembler::parity, L);   // C2 set -> reduction incomplete, retry
  }
  restore_rax(tmp);
  // Result is in ST0.
  // Note: fxch & fpop to get rid of ST1
  // (otherwise FPU stack could overflow eventually)
  fxch(1);
  fpop();
}
2285 
// 32-bit only: clear the entire x87 register stack.  emms does it in one
// instruction where MMX exists; otherwise free all eight slots manually.
void MacroAssembler::empty_FPU_stack() {
  if (VM_Version::supports_mmx()) {
    emms();
  } else {
    for (int i = 8; i-- > 0; ) ffree(i);
  }
}
2293 #endif // !LP64
2294 
2295 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2296   if (reachable(src)) {
2297     Assembler::mulpd(dst, as_Address(src));
2298   } else {
2299     lea(rscratch1, src);
2300     Assembler::mulpd(dst, Address(rscratch1, 0));
2301   }
2302 }
2303 
// Load a float into the FP return/working location: xmm0 on 64-bit and
// on 32-bit with SSE, the x87 stack otherwise.
void MacroAssembler::load_float(Address src) {
#ifdef _LP64
  movflt(xmm0, src);
#else
  if (UseSSE >= 1) {
    movflt(xmm0, src);
  } else {
    fld_s(src);
  }
#endif // LP64
}
2315 
// Store the float FP value (xmm0, or ST0 on pre-SSE 32-bit) to memory.
void MacroAssembler::store_float(Address dst) {
#ifdef _LP64
  movflt(dst, xmm0);
#else
  if (UseSSE >= 1) {
    movflt(dst, xmm0);
  } else {
    fstp_s(dst);
  }
#endif // LP64
}
2327 
// Load a double into the FP return/working location: xmm0 on 64-bit and
// on 32-bit with SSE2, the x87 stack otherwise.
void MacroAssembler::load_double(Address src) {
#ifdef _LP64
  movdbl(xmm0, src);
#else
  if (UseSSE >= 2) {
    movdbl(xmm0, src);
  } else {
    fld_d(src);
  }
#endif // LP64
}
2339 
// Store the double FP value (xmm0, or ST0 on pre-SSE2 32-bit) to memory.
void MacroAssembler::store_double(Address dst) {
#ifdef _LP64
  movdbl(dst, xmm0);
#else
  if (UseSSE >= 2) {
    movdbl(dst, xmm0);
  } else {
    fstp_d(dst);
  }
#endif // LP64
}
2351 
// dst = c = a * b + c
// Scalar double fused multiply-add.  vfmadd231sd accumulates into c, so
// the result is copied to dst only when they differ.
void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
  Assembler::vfmadd231sd(c, a, b);
  if (dst != c) {
    movdbl(dst, c);
  }
}
2359 
// dst = c = a * b + c
// Scalar float fused multiply-add; see fmad above.
void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
  Assembler::vfmadd231ss(c, a, b);
  if (dst != c) {
    movflt(dst, c);
  }
}
2367 
// dst = c = a * b + c
// Packed double fused multiply-add (register operands).
void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231pd(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}
2375 
// dst = c = a * b + c
// Packed float fused multiply-add (register operands).
void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231ps(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}
2383 
// dst = c = a * b + c
// Packed double fused multiply-add with a memory multiplicand.
void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231pd(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}
2391 
// dst = c = a * b + c
// Packed float fused multiply-add with a memory multiplicand.
void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231ps(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}
2399 
2400 void MacroAssembler::incrementl(AddressLiteral dst) {
2401   if (reachable(dst)) {
2402     incrementl(as_Address(dst));
2403   } else {
2404     lea(rscratch1, dst);
2405     incrementl(Address(rscratch1, 0));
2406   }
2407 }
2408 
// Increment a 32-bit value at an array address.
void MacroAssembler::incrementl(ArrayAddress dst) {
  incrementl(as_Address(dst));
}
2412 
2413 void MacroAssembler::incrementl(Register reg, int value) {
2414   if (value == min_jint) {addl(reg, value) ; return; }
2415   if (value <  0) { decrementl(reg, -value); return; }
2416   if (value == 0) {                        ; return; }
2417   if (value == 1 && UseIncDec) { incl(reg) ; return; }
2418   /* else */      { addl(reg, value)       ; return; }
2419 }
2420 
2421 void MacroAssembler::incrementl(Address dst, int value) {
2422   if (value == min_jint) {addl(dst, value) ; return; }
2423   if (value <  0) { decrementl(dst, -value); return; }
2424   if (value == 0) {                        ; return; }
2425   if (value == 1 && UseIncDec) { incl(dst) ; return; }
2426   /* else */      { addl(dst, value)       ; return; }
2427 }
2428 
2429 void MacroAssembler::jump(AddressLiteral dst) {
2430   if (reachable(dst)) {
2431     jmp_literal(dst.target(), dst.rspec());
2432   } else {
2433     lea(rscratch1, dst);
2434     jmp(rscratch1);
2435   }
2436 }
2437 
// Conditional jump to a code literal.  When the target is reachable the
// Jcc is hand-encoded (short rel8 form only when there is no relocation,
// since a relocated displacement must stay 32-bit).  Otherwise the
// condition is reversed to branch around an indirect far jump.
void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
  if (reachable(dst)) {
    InstructionMark im(this);
    relocate(dst.reloc());
    const int short_size = 2;
    const int long_size = 6;
    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_int8(0x70 | cc);
      emit_int8((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      emit_int8(0x0F);
      emit_int8((unsigned char)(0x80 | cc));
      emit_int32(offs - long_size);
    }
  } else {
#ifdef ASSERT
    warning("reversing conditional branch");
#endif /* ASSERT */
    Label skip;
    jccb(reverse[cc], skip);
    lea(rscratch1, dst);
    Assembler::jmp(rscratch1);
    bind(skip);
  }
}
2466 
// Push an extended-precision (80-bit) literal onto the x87 stack.
void MacroAssembler::fld_x(AddressLiteral src) {
  Assembler::fld_x(as_Address(src));
}
2470 
2471 void MacroAssembler::ldmxcsr(AddressLiteral src) {
2472   if (reachable(src)) {
2473     Assembler::ldmxcsr(as_Address(src));
2474   } else {
2475     lea(rscratch1, src);
2476     Assembler::ldmxcsr(Address(rscratch1, 0));
2477   }
2478 }
2479 
// Load a sign-extended byte into dst; returns the code offset of the
// load instruction (for implicit null-check bookkeeping).  Pre-P6
// 32-bit chips lack a fast movsx, so emulate with a zero-extending load
// plus shift pair.
int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    off = offset();
    movsbl(dst, src); // movsxb
  } else {
    off = load_unsigned_byte(dst, src);
    shll(dst, 24);
    sarl(dst, 24);
  }
  return off;
}
2492 
// Note: load_signed_short used to be called load_signed_word.
// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
// The term "word" in HotSpot means a 32- or 64-bit machine word.
//
// Load a sign-extended 16-bit value into dst; returns the code offset of
// the load.  Pre-P6 fallback uses a zero-extending load plus shift pair.
int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
    // version but this is what 64bit has always done. This seems to imply
    // that users are only using 32bits worth.
    off = offset();
    movswl(dst, src); // movsxw
  } else {
    off = load_unsigned_short(dst, src);
    shll(dst, 16);
    sarl(dst, 16);
  }
  return off;
}
2512 
// Load a zero-extended byte into dst; returns the code offset of the
// load.  The xorl+movb fallback is skipped when src uses dst, since the
// xor would clobber the address computation.
int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzbl(dst, src); // movzxb
  } else {
    xorl(dst, dst);
    off = offset();
    movb(dst, src);
  }
  return off;
}
2527 
// Note: load_unsigned_short used to be called load_unsigned_word.
// Load a zero-extended 16-bit value into dst; returns the code offset of
// the load.  Same dst-aliasing caveat as load_unsigned_byte.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzwl(dst, src); // movzxw
  } else {
    xorl(dst, dst);
    off = offset();
    movw(dst, src);
  }
  return off;
}
2543 
// Load a value of 1/2/4/8 bytes from src, sign- or zero-extending the
// sub-word sizes.  On 32-bit, an 8-byte load needs dst2 for the high
// half (loaded from src + BytesPerInt).
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(dst2 != noreg, "second dest register required");
    movl(dst,  src);
    movl(dst2, src.plus_disp(BytesPerInt));
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default:  ShouldNotReachHere();
  }
}
2561 
// Store a value of 1/2/4/8 bytes to dst.  On 32-bit, an 8-byte store
// takes the high half from src2 (stored at dst + BytesPerInt).
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(src2 != noreg, "second source register required");
    movl(dst,                        src);
    movl(dst.plus_disp(BytesPerInt), src2);
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  movw(dst, src); break;
  case  1:  movb(dst, src); break;
  default:  ShouldNotReachHere();
  }
}
2579 
2580 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
2581   if (reachable(dst)) {
2582     movl(as_Address(dst), src);
2583   } else {
2584     lea(rscratch1, dst);
2585     movl(Address(rscratch1, 0), src);
2586   }
2587 }
2588 
2589 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2590   if (reachable(src)) {
2591     movl(dst, as_Address(src));
2592   } else {
2593     lea(rscratch1, src);
2594     movl(dst, Address(rscratch1, 0));
2595   }
2596 }
2597 
2598 // C++ bool manipulation
2599 
// Load a C++ bool from memory.  sizeof(bool) is implementation-defined,
// so pick the matching move width at (C++) compile time.
void MacroAssembler::movbool(Register dst, Address src) {
  if(sizeof(bool) == 1)
    movb(dst, src);
  else if(sizeof(bool) == 2)
    movw(dst, src);
  else if(sizeof(bool) == 4)
    movl(dst, src);
  else
    // unsupported
    ShouldNotReachHere();
}
2611 
// Store a bool constant to memory, width chosen by sizeof(bool).
void MacroAssembler::movbool(Address dst, bool boolconst) {
  if(sizeof(bool) == 1)
    movb(dst, (int) boolconst);
  else if(sizeof(bool) == 2)
    movw(dst, (int) boolconst);
  else if(sizeof(bool) == 4)
    movl(dst, (int) boolconst);
  else
    // unsupported
    ShouldNotReachHere();
}
2623 
2624 void MacroAssembler::movbool(Address dst, Register src) {
2625   if(sizeof(bool) == 1)
2626     movb(dst, src);
2627   else if(sizeof(bool) == 2)
2628     movw(dst, src);
2629   else if(sizeof(bool) == 4)
2630     movl(dst, src);
2631   else
2632     // unsupported
2633     ShouldNotReachHere();
2634 }
2635 
// Store the immediate byte src at the array address dst.
void MacroAssembler::movbyte(ArrayAddress dst, int src) {
  movb(as_Address(dst), src);
}
2639 
2640 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
2641   if (reachable(src)) {
2642     movdl(dst, as_Address(src));
2643   } else {
2644     lea(rscratch1, src);
2645     movdl(dst, Address(rscratch1, 0));
2646   }
2647 }
2648 
2649 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
2650   if (reachable(src)) {
2651     movq(dst, as_Address(src));
2652   } else {
2653     lea(rscratch1, src);
2654     movq(dst, Address(rscratch1, 0));
2655   }
2656 }
2657 
2658 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
2659   if (reachable(src)) {
2660     if (UseXmmLoadAndClearUpper) {
2661       movsd (dst, as_Address(src));
2662     } else {
2663       movlpd(dst, as_Address(src));
2664     }
2665   } else {
2666     lea(rscratch1, src);
2667     if (UseXmmLoadAndClearUpper) {
2668       movsd (dst, Address(rscratch1, 0));
2669     } else {
2670       movlpd(dst, Address(rscratch1, 0));
2671     }
2672   }
2673 }
2674 
2675 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
2676   if (reachable(src)) {
2677     movss(dst, as_Address(src));
2678   } else {
2679     lea(rscratch1, src);
2680     movss(dst, Address(rscratch1, 0));
2681   }
2682 }
2683 
// Pointer-width register-to-register move (movq on 64-bit, movl on 32-bit).
void MacroAssembler::movptr(Register dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
2687 
// Pointer-width load from memory (movq on 64-bit, movl on 32-bit).
void MacroAssembler::movptr(Register dst, Address src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
2691 
2692 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
2693 void MacroAssembler::movptr(Register dst, intptr_t src) {
2694   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
2695 }
2696 
// Pointer-width store to memory (movq on 64-bit, movl on 32-bit).
void MacroAssembler::movptr(Address dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
2700 
// Unaligned 128-bit store; registers 16-31 are only legal with AVX512VL.
void MacroAssembler::movdqu(Address dst, XMMRegister src) {
    assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
    Assembler::movdqu(dst, src);
}
2705 
// Unaligned 128-bit load; registers 16-31 are only legal with AVX512VL.
void MacroAssembler::movdqu(XMMRegister dst, Address src) {
    assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
    Assembler::movdqu(dst, src);
}
2710 
// 128-bit register-to-register move; registers 16-31 need AVX512VL.
void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
    assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
    Assembler::movdqu(dst, src);
}
2715 
2716 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
2717   if (reachable(src)) {
2718     movdqu(dst, as_Address(src));
2719   } else {
2720     lea(scratchReg, src);
2721     movdqu(dst, Address(scratchReg, 0));
2722   }
2723 }
2724 
// Unaligned 256-bit store; registers 16-31 are only legal with AVX512VL.
void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
    assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
    Assembler::vmovdqu(dst, src);
}
2729 
// Unaligned 256-bit load; registers 16-31 are only legal with AVX512VL.
void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
    assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
    Assembler::vmovdqu(dst, src);
}
2734 
// 256-bit register-to-register move; registers 16-31 need AVX512VL.
void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
    assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
    Assembler::vmovdqu(dst, src);
}
2739 
2740 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
2741   if (reachable(src)) {
2742     vmovdqu(dst, as_Address(src));
2743   }
2744   else {
2745     lea(scratch_reg, src);
2746     vmovdqu(dst, Address(scratch_reg, 0));
2747   }
2748 }
2749 
2750 void MacroAssembler::kmov(KRegister dst, Address src) {
2751   if (VM_Version::supports_avx512bw()) {
2752     kmovql(dst, src);
2753   } else {
2754     assert(VM_Version::supports_evex(), "");
2755     kmovwl(dst, src);
2756   }
2757 }
2758 
2759 void MacroAssembler::kmov(Address dst, KRegister src) {
2760   if (VM_Version::supports_avx512bw()) {
2761     kmovql(dst, src);
2762   } else {
2763     assert(VM_Version::supports_evex(), "");
2764     kmovwl(dst, src);
2765   }
2766 }
2767 
2768 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2769   if (VM_Version::supports_avx512bw()) {
2770     kmovql(dst, src);
2771   } else {
2772     assert(VM_Version::supports_evex(), "");
2773     kmovwl(dst, src);
2774   }
2775 }
2776 
2777 void MacroAssembler::kmov(Register dst, KRegister src) {
2778   if (VM_Version::supports_avx512bw()) {
2779     kmovql(dst, src);
2780   } else {
2781     assert(VM_Version::supports_evex(), "");
2782     kmovwl(dst, src);
2783   }
2784 }
2785 
2786 void MacroAssembler::kmov(KRegister dst, Register src) {
2787   if (VM_Version::supports_avx512bw()) {
2788     kmovql(dst, src);
2789   } else {
2790     assert(VM_Version::supports_evex(), "");
2791     kmovwl(dst, src);
2792   }
2793 }
2794 
2795 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register scratch_reg) {
2796   if (reachable(src)) {
2797     kmovql(dst, as_Address(src));
2798   } else {
2799     lea(scratch_reg, src);
2800     kmovql(dst, Address(scratch_reg, 0));
2801   }
2802 }
2803 
2804 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) {
2805   if (reachable(src)) {
2806     kmovwl(dst, as_Address(src));
2807   } else {
2808     lea(scratch_reg, src);
2809     kmovwl(dst, Address(scratch_reg, 0));
2810   }
2811 }
2812 
// Byte-vector load from a literal address, optionally masked. Passing k0
// selects the unmasked instruction form; any other mask register selects
// the masked form with merge/zero semantics chosen by 'merge'. Falls back
// to scratch_reg when the literal is not directly reachable.
void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
                               int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    if (mask == k0) {
      Assembler::evmovdqub(dst, as_Address(src), merge, vector_len);
    } else {
      Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
    }
  } else {
    lea(scratch_reg, src);
    if (mask == k0) {
      Assembler::evmovdqub(dst, Address(scratch_reg, 0), merge, vector_len);
    } else {
      Assembler::evmovdqub(dst, mask, Address(scratch_reg, 0), merge, vector_len);
    }
  }
}
2830 
2831 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2832                                int vector_len, Register scratch_reg) {
2833   if (reachable(src)) {
2834     Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2835   } else {
2836     lea(scratch_reg, src);
2837     Assembler::evmovdquw(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2838   }
2839 }
2840 
2841 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2842                                int vector_len, Register scratch_reg) {
2843   if (reachable(src)) {
2844     Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2845   } else {
2846     lea(scratch_reg, src);
2847     Assembler::evmovdqul(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2848   }
2849 }
2850 
2851 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2852                                int vector_len, Register scratch_reg) {
2853   if (reachable(src)) {
2854     Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2855   } else {
2856     lea(scratch_reg, src);
2857     Assembler::evmovdquq(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2858   }
2859 }
2860 
2861 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2862   if (reachable(src)) {
2863     Assembler::evmovdquq(dst, as_Address(src), vector_len);
2864   } else {
2865     lea(rscratch, src);
2866     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2867   }
2868 }
2869 
2870 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
2871   if (reachable(src)) {
2872     Assembler::movdqa(dst, as_Address(src));
2873   } else {
2874     lea(rscratch1, src);
2875     Assembler::movdqa(dst, Address(rscratch1, 0));
2876   }
2877 }
2878 
2879 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
2880   if (reachable(src)) {
2881     Assembler::movsd(dst, as_Address(src));
2882   } else {
2883     lea(rscratch1, src);
2884     Assembler::movsd(dst, Address(rscratch1, 0));
2885   }
2886 }
2887 
2888 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
2889   if (reachable(src)) {
2890     Assembler::movss(dst, as_Address(src));
2891   } else {
2892     lea(rscratch1, src);
2893     Assembler::movss(dst, Address(rscratch1, 0));
2894   }
2895 }
2896 
2897 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
2898   if (reachable(src)) {
2899     Assembler::mulsd(dst, as_Address(src));
2900   } else {
2901     lea(rscratch1, src);
2902     Assembler::mulsd(dst, Address(rscratch1, 0));
2903   }
2904 }
2905 
2906 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
2907   if (reachable(src)) {
2908     Assembler::mulss(dst, as_Address(src));
2909   } else {
2910     lea(rscratch1, src);
2911     Assembler::mulss(dst, Address(rscratch1, 0));
2912   }
2913 }
2914 
// Null-check reg: when offset is large enough that the eventual access at
// M[reg + offset] cannot be relied on to fault for reg == NULL, emit an
// explicit access of M[reg] to provoke the OS exception now.
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any (non-CC) registers
    // NOTE: cmpl is plenty here to provoke a segv
    cmpptr(rax, Address(reg, 0));
    // Note: should probably use testl(rax, Address(reg, 0));
    //       may be shorter code (however, this version of
    //       testl needs to be implemented first)
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}
2929 
// Emit a call to os::breakpoint() rather than a raw breakpoint instruction.
void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
  // (e.g., MSVC can't call ps() otherwise)
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}
2935 
2936 void MacroAssembler::unimplemented(const char* what) {
2937   const char* buf = NULL;
2938   {
2939     ResourceMark rm;
2940     stringStream ss;
2941     ss.print("unimplemented: %s", what);
2942     buf = code_string(ss.as_string());
2943   }
2944   stop(buf);
2945 }
2946 
2947 #ifdef _LP64
2948 #define XSTATE_BV 0x200
2949 #endif
2950 
// Restore the state saved by push_CPU_state: FPU state first (it was
// pushed last), then the integer registers and flags.
void MacroAssembler::pop_CPU_state() {
  pop_FPU_state();
  pop_IU_state();
}
2955 
// Restore FPU state saved by push_FPU_state (frstor on 32-bit, fxrstor on
// 64-bit) and release the save area on the stack.
void MacroAssembler::pop_FPU_state() {
#ifndef _LP64
  frstor(Address(rsp, 0));
#else
  fxrstor(Address(rsp, 0));
#endif
  addptr(rsp, FPUStateSizeInWords * wordSize);
}
2964 
// Restore integer registers and flags saved by push_IU_state. The addq
// discards the 8-byte alignment slot pushed on 64-bit.
void MacroAssembler::pop_IU_state() {
  popa();
  LP64_ONLY(addq(rsp, 8));
  popf();
}
2970 
2971 // Save Integer and Float state
2972 // Warning: Stack must be 16 byte aligned (64bit)
2973 void MacroAssembler::push_CPU_state() {
2974   push_IU_state();
2975   push_FPU_state();
2976 }
2977 
// Reserve FPUStateSizeInWords on the stack and save the FPU state there
// (fnsave+fwait on 32-bit, fxsave on 64-bit).
void MacroAssembler::push_FPU_state() {
  subptr(rsp, FPUStateSizeInWords * wordSize);
#ifndef _LP64
  fnsave(Address(rsp, 0));
  fwait();
#else
  fxsave(Address(rsp, 0));
#endif // LP64
}
2987 
// Save flags and all integer registers on the stack.
void MacroAssembler::push_IU_state() {
  // Push flags first because pusha kills them
  pushf();
  // Make sure rsp stays 16-byte aligned
  LP64_ONLY(subq(rsp, 8));
  pusha();
}
2995 
// Clear the JavaThread's last-Java-frame anchor (sp, optionally fp, and pc)
// so the frame is no longer treated as walkable. If java_thread is not a
// valid register, the current thread is loaded into rdi.
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // we must set sp to zero to clear frame
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }
  // Always clear the pc because it could have been set by make_walkable()
  movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  // NOTE(review): vzeroupper presumably avoids AVX/SSE transition penalties
  // around the native code being entered/left here — confirm.
  vzeroupper();
}
3012 
3013 void MacroAssembler::restore_rax(Register tmp) {
3014   if (tmp == noreg) pop(rax);
3015   else if (tmp != rax) mov(rax, tmp);
3016 }
3017 
// Round reg up to the nearest multiple of modulus. The and-mask trick is
// only correct when modulus is a power of two.
void MacroAssembler::round_to(Register reg, int modulus) {
  addptr(reg, modulus - 1);
  andptr(reg, -modulus);
}
3022 
3023 void MacroAssembler::save_rax(Register tmp) {
3024   if (tmp == noreg) push(rax);
3025   else if (tmp != rax) mov(tmp, rax);
3026 }
3027 
// Poll the thread-local polling word and branch to slow_path when a
// safepoint/handshake is pending. At a return site the stack pointer is
// compared against the polling word (stack watermark check); elsewhere
// only the poll bit is tested.
void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
  if (at_return) {
    // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
    // we may safely use rsp instead to perform the stack watermark check.
    cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset()));
    jcc(Assembler::above, slow_path);
    return;
  }
  testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
  jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
}
3039 
3040 // Calls to C land
3041 //
3042 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded
3043 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3044 // has to be reset to 0. This is required to allow proper stack traversal.
// Record the last Java frame (sp, optional fp, optional pc) in the
// JavaThread's frame anchor. Invalid java_thread defaults to the current
// thread (loaded into rdi); invalid last_java_sp defaults to rsp. sp is
// stored last.
void MacroAssembler::set_last_Java_frame(Register java_thread,
                                         Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // NOTE(review): vzeroupper presumably avoids AVX/SSE transition penalties
  // in the C land being entered — confirm.
  vzeroupper();
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional

  if (last_java_fp->is_valid()) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
  }

  // last_java_pc is optional

  if (last_java_pc != NULL) {
    lea(Address(java_thread,
                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
        InternalAddress(last_java_pc));

  }
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
3076 
3077 void MacroAssembler::shlptr(Register dst, int imm8) {
3078   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3079 }
3080 
3081 void MacroAssembler::shrptr(Register dst, int imm8) {
3082   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3083 }
3084 
// Sign-extend the low byte of reg into the full 32-bit register. Uses
// movsbl where possible (always on 64-bit; on 32-bit only for P6+ and a
// register with a byte form), otherwise a shift-left/sar pair.
void MacroAssembler::sign_extend_byte(Register reg) {
  if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
    movsbl(reg, reg); // movsxb
  } else {
    shll(reg, 24);
    sarl(reg, 24);
  }
}
3093 
// Sign-extend the low 16 bits of reg into the full 32-bit register. Uses
// movswl where possible (always on 64-bit; P6+ on 32-bit), otherwise a
// shift-left/sar pair.
void MacroAssembler::sign_extend_short(Register reg) {
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    movswl(reg, reg); // movsxw
  } else {
    shll(reg, 16);
    sarl(reg, 16);
  }
}
3102 
// testl against a literal address. Unlike most literal wrappers here there
// is no scratch-register fallback: the address must be directly reachable.
void MacroAssembler::testl(Register dst, AddressLiteral src) {
  assert(reachable(src), "Address should be reachable");
  testl(dst, as_Address(src));
}
3107 
// pcmpeqb; registers 16-31 are only allowed with AVX512VLBW.
void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pcmpeqb(dst, src);
}
3112 
// pcmpeqw; registers 16-31 are only allowed with AVX512VLBW.
void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pcmpeqw(dst, src);
}
3117 
// pcmpestri with a memory operand; only XMM 0-15 are accepted here (no
// AVX-512 relaxation in the assert).
void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
  assert((dst->encoding() < 16),"XMM register should be 0-15");
  Assembler::pcmpestri(dst, src, imm8);
}
3122 
// pcmpestri register form; only XMM 0-15 are accepted here (no AVX-512
// relaxation in the assert).
void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
  Assembler::pcmpestri(dst, src, imm8);
}
3127 
// pmovzxbw register form; registers 16-31 are only allowed with AVX512VLBW.
void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pmovzxbw(dst, src);
}
3132 
// pmovzxbw memory form; registers 16-31 are only allowed with AVX512VLBW.
void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pmovzxbw(dst, src);
}
3137 
// pmovmskb; only XMM 0-15 are accepted (no AVX-512 relaxation here).
void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
  assert((src->encoding() < 16),"XMM register should be 0-15");
  Assembler::pmovmskb(dst, src);
}
3142 
// ptest; only XMM 0-15 are accepted (no AVX-512 relaxation here).
void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
  Assembler::ptest(dst, src);
}
3147 
3148 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3149   if (reachable(src)) {
3150     Assembler::sqrtsd(dst, as_Address(src));
3151   } else {
3152     lea(rscratch1, src);
3153     Assembler::sqrtsd(dst, Address(rscratch1, 0));
3154   }
3155 }
3156 
3157 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3158   if (reachable(src)) {
3159     Assembler::sqrtss(dst, as_Address(src));
3160   } else {
3161     lea(rscratch1, src);
3162     Assembler::sqrtss(dst, Address(rscratch1, 0));
3163   }
3164 }
3165 
3166 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3167   if (reachable(src)) {
3168     Assembler::subsd(dst, as_Address(src));
3169   } else {
3170     lea(rscratch1, src);
3171     Assembler::subsd(dst, Address(rscratch1, 0));
3172   }
3173 }
3174 
3175 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) {
3176   if (reachable(src)) {
3177     Assembler::roundsd(dst, as_Address(src), rmode);
3178   } else {
3179     lea(scratch_reg, src);
3180     Assembler::roundsd(dst, Address(scratch_reg, 0), rmode);
3181   }
3182 }
3183 
3184 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3185   if (reachable(src)) {
3186     Assembler::subss(dst, as_Address(src));
3187   } else {
3188     lea(rscratch1, src);
3189     Assembler::subss(dst, Address(rscratch1, 0));
3190   }
3191 }
3192 
3193 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3194   if (reachable(src)) {
3195     Assembler::ucomisd(dst, as_Address(src));
3196   } else {
3197     lea(rscratch1, src);
3198     Assembler::ucomisd(dst, Address(rscratch1, 0));
3199   }
3200 }
3201 
3202 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3203   if (reachable(src)) {
3204     Assembler::ucomiss(dst, as_Address(src));
3205   } else {
3206     lea(rscratch1, src);
3207     Assembler::ucomiss(dst, Address(rscratch1, 0));
3208   }
3209 }
3210 
// xorpd with a literal operand, used for sign-bit flipping. In SSE mode
// (no AVX) the target must be 16-byte aligned, as the assert enforces.
// Falls back to scratch_reg when the literal is not directly reachable.
void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
  // Used in sign-bit flipping with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::xorpd(dst, as_Address(src));
  } else {
    lea(scratch_reg, src);
    Assembler::xorpd(dst, Address(scratch_reg, 0));
  }
}
3221 
// xorpd register form. When running with UseAVX > 2 but without AVX512DQ
// and dst == src, a 512-bit vpxor is emitted instead. NOTE(review): this
// looks like the self-xor zeroing idiom being rerouted to an instruction
// encodable without AVX512DQ — confirm.
void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
  if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
  }
  else {
    Assembler::xorpd(dst, src);
  }
}
3230 
// xorps register form; same AVX-512 special case as xorpd above: with
// UseAVX > 2, no AVX512DQ, and dst == src, emit a 512-bit vpxor instead.
void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
  if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
  } else {
    Assembler::xorps(dst, src);
  }
}
3238 
// xorps with a literal operand, used for sign-bit flipping. In SSE mode
// (no AVX) the target must be 16-byte aligned, as the assert enforces.
// Falls back to scratch_reg when the literal is not directly reachable.
void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
  // Used in sign-bit flipping with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::xorps(dst, as_Address(src));
  } else {
    lea(scratch_reg, src);
    Assembler::xorps(dst, Address(scratch_reg, 0));
  }
}
3249 
// pshufb with a constant shuffle mask taken from a literal address. In SSE
// mode (no AVX) the mask must be 16-byte aligned; falls back to rscratch1
// when the literal is not directly reachable.
void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
  // SSE form requires an aligned memory operand.
  bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
  assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::pshufb(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::pshufb(dst, Address(rscratch1, 0));
  }
}
3261 
3262 // AVX 3-operands instructions
3263 
3264 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3265   if (reachable(src)) {
3266     vaddsd(dst, nds, as_Address(src));
3267   } else {
3268     lea(rscratch1, src);
3269     vaddsd(dst, nds, Address(rscratch1, 0));
3270   }
3271 }
3272 
3273 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3274   if (reachable(src)) {
3275     vaddss(dst, nds, as_Address(src));
3276   } else {
3277     lea(rscratch1, src);
3278     vaddss(dst, nds, Address(rscratch1, 0));
3279   }
3280 }
3281 
3282 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3283   assert(UseAVX > 0, "requires some form of AVX");
3284   if (reachable(src)) {
3285     Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
3286   } else {
3287     lea(rscratch, src);
3288     Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
3289   }
3290 }
3291 
3292 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3293   assert(UseAVX > 0, "requires some form of AVX");
3294   if (reachable(src)) {
3295     Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3296   } else {
3297     lea(rscratch, src);
3298     Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3299   }
3300 }
3301 
// Single-precision abs helper: ANDs nds with the constant at negate_field
// into dst. src takes part only in the encoding-range assert.
// NOTE(review): negate_field is presumably a sign-bit mask — confirm
// against the constant used by callers.
void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
  vandps(dst, nds, negate_field, vector_len);
}
3306 
// Double-precision abs helper: ANDs nds with the constant at negate_field
// into dst. src takes part only in the encoding-range assert.
// NOTE(review): negate_field is presumably a sign-bit mask — confirm
// against the constant used by callers.
void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
  vandpd(dst, nds, negate_field, vector_len);
}
3311 
// vpaddb register form; registers 16-31 are only allowed with AVX512VLBW.
void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpaddb(dst, nds, src, vector_len);
}
3316 
// vpaddb memory form; registers 16-31 are only allowed with AVX512VLBW.
void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpaddb(dst, nds, src, vector_len);
}
3321 
// vpaddw register form; registers 16-31 are only allowed with AVX512VLBW.
void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpaddw(dst, nds, src, vector_len);
}
3326 
// vpaddw memory form; registers 16-31 are only allowed with AVX512VLBW.
void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpaddw(dst, nds, src, vector_len);
}
3331 
3332 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3333   if (reachable(src)) {
3334     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3335   } else {
3336     lea(scratch_reg, src);
3337     Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3338   }
3339 }
3340 
// vpbroadcastw; registers 16-31 are only allowed with AVX512VLBW.
void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpbroadcastw(dst, src, vector_len);
}
3345 
// vpcmpeqb; registers 16-31 are only allowed with AVX512VLBW.
void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpcmpeqb(dst, nds, src, vector_len);
}
3350 
// vpcmpeqw; registers 16-31 are only allowed with AVX512VLBW.
void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpcmpeqw(dst, nds, src, vector_len);
}
3355 
3356 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds,
3357                                AddressLiteral src, int vector_len, Register scratch_reg) {
3358   if (reachable(src)) {
3359     Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3360   } else {
3361     lea(scratch_reg, src);
3362     Assembler::evpcmpeqd(kdst, mask, nds, Address(scratch_reg, 0), vector_len);
3363   }
3364 }
3365 
3366 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3367                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3368   if (reachable(src)) {
3369     Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3370   } else {
3371     lea(scratch_reg, src);
3372     Assembler::evpcmpd(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3373   }
3374 }
3375 
3376 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3377                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3378   if (reachable(src)) {
3379     Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3380   } else {
3381     lea(scratch_reg, src);
3382     Assembler::evpcmpq(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3383   }
3384 }
3385 
3386 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3387                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3388   if (reachable(src)) {
3389     Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3390   } else {
3391     lea(scratch_reg, src);
3392     Assembler::evpcmpb(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3393   }
3394 }
3395 
3396 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3397                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3398   if (reachable(src)) {
3399     Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3400   } else {
3401     lea(scratch_reg, src);
3402     Assembler::evpcmpw(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3403   }
3404 }
3405 
3406 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3407   if (width == Assembler::Q) {
3408     Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3409   } else {
3410     Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3411   }
3412 }
3413 
// Emit a packed compare for an arbitrary predicate by composing the
// available eq/gt encodings: neq/le/nlt are the bitwise complement (vpxor
// with an all-ones constant) of eq, gt, and gt-with-swapped-operands
// respectively; lt is gt with the operands swapped.
void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg) {
  int eq_cond_enc = 0x29; // quadword encodings by default
  int gt_cond_enc = 0x37;
  if (width != Assembler::Q) {
    // byte/word/dword encodings are derived from the width
    eq_cond_enc = 0x74 + width;
    gt_cond_enc = 0x64 + width;
  }
  switch (cond) {
  case eq:
    vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
    break;
  case neq:
    // eq, then complement
    vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
    vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
    break;
  case le:
    // gt, then complement
    vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
    vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
    break;
  case nlt:
    // gt with swapped operands, then complement
    vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
    vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
    break;
  case lt:
    // gt with swapped operands
    vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
    break;
  case nle:
    vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
    break;
  default:
    assert(false, "Should not reach here");
  }
}
3447 
// vpmovzxbw memory form; registers 16-31 are only allowed with AVX512VLBW.
void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpmovzxbw(dst, src, vector_len);
}
3452 
// vpmovmskb; only XMM 0-15 are accepted (no AVX-512 relaxation here).
void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
  assert((src->encoding() < 16),"XMM register should be 0-15");
  Assembler::vpmovmskb(dst, src, vector_len);
}
3457 
// vpmullw register form; registers 16-31 are only allowed with AVX512VLBW.
void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpmullw(dst, nds, src, vector_len);
}
3462 
3463 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3464   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3465   Assembler::vpmullw(dst, nds, src, vector_len);
3466 }
3467 
3468 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3469   assert((UseAVX > 0), "AVX support is needed");
3470   if (reachable(src)) {
3471     Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3472   } else {
3473     lea(scratch_reg, src);
3474     Assembler::vpmulld(dst, nds, Address(scratch_reg, 0), vector_len);
3475   }
3476 }
3477 
3478 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3479   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3480   Assembler::vpsubb(dst, nds, src, vector_len);
3481 }
3482 
3483 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3484   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3485   Assembler::vpsubb(dst, nds, src, vector_len);
3486 }
3487 
3488 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3489   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3490   Assembler::vpsubw(dst, nds, src, vector_len);
3491 }
3492 
3493 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3494   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3495   Assembler::vpsubw(dst, nds, src, vector_len);
3496 }
3497 
3498 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3499   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3500   Assembler::vpsraw(dst, nds, shift, vector_len);
3501 }
3502 
3503 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3504   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3505   Assembler::vpsraw(dst, nds, shift, vector_len);
3506 }
3507 
3508 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3509   assert(UseAVX > 2,"");
3510   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3511      vector_len = 2;
3512   }
3513   Assembler::evpsraq(dst, nds, shift, vector_len);
3514 }
3515 
3516 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3517   assert(UseAVX > 2,"");
3518   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3519      vector_len = 2;
3520   }
3521   Assembler::evpsraq(dst, nds, shift, vector_len);
3522 }
3523 
3524 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3525   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3526   Assembler::vpsrlw(dst, nds, shift, vector_len);
3527 }
3528 
3529 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3530   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3531   Assembler::vpsrlw(dst, nds, shift, vector_len);
3532 }
3533 
3534 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3535   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3536   Assembler::vpsllw(dst, nds, shift, vector_len);
3537 }
3538 
3539 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3540   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3541   Assembler::vpsllw(dst, nds, shift, vector_len);
3542 }
3543 
3544 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3545   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3546   Assembler::vptest(dst, src);
3547 }
3548 
3549 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3550   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3551   Assembler::punpcklbw(dst, src);
3552 }
3553 
3554 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3555   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3556   Assembler::pshufd(dst, src, mode);
3557 }
3558 
3559 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3560   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3561   Assembler::pshuflw(dst, src, mode);
3562 }
3563 
3564 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3565   if (reachable(src)) {
3566     vandpd(dst, nds, as_Address(src), vector_len);
3567   } else {
3568     lea(scratch_reg, src);
3569     vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3570   }
3571 }
3572 
3573 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3574   if (reachable(src)) {
3575     vandps(dst, nds, as_Address(src), vector_len);
3576   } else {
3577     lea(scratch_reg, src);
3578     vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3579   }
3580 }
3581 
3582 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3583                             bool merge, int vector_len, Register scratch_reg) {
3584   if (reachable(src)) {
3585     Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3586   } else {
3587     lea(scratch_reg, src);
3588     Assembler::evpord(dst, mask, nds, Address(scratch_reg, 0), merge, vector_len);
3589   }
3590 }
3591 
3592 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3593   if (reachable(src)) {
3594     vdivsd(dst, nds, as_Address(src));
3595   } else {
3596     lea(rscratch1, src);
3597     vdivsd(dst, nds, Address(rscratch1, 0));
3598   }
3599 }
3600 
3601 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3602   if (reachable(src)) {
3603     vdivss(dst, nds, as_Address(src));
3604   } else {
3605     lea(rscratch1, src);
3606     vdivss(dst, nds, Address(rscratch1, 0));
3607   }
3608 }
3609 
3610 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3611   if (reachable(src)) {
3612     vmulsd(dst, nds, as_Address(src));
3613   } else {
3614     lea(rscratch1, src);
3615     vmulsd(dst, nds, Address(rscratch1, 0));
3616   }
3617 }
3618 
3619 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3620   if (reachable(src)) {
3621     vmulss(dst, nds, as_Address(src));
3622   } else {
3623     lea(rscratch1, src);
3624     vmulss(dst, nds, Address(rscratch1, 0));
3625   }
3626 }
3627 
3628 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3629   if (reachable(src)) {
3630     vsubsd(dst, nds, as_Address(src));
3631   } else {
3632     lea(rscratch1, src);
3633     vsubsd(dst, nds, Address(rscratch1, 0));
3634   }
3635 }
3636 
3637 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3638   if (reachable(src)) {
3639     vsubss(dst, nds, as_Address(src));
3640   } else {
3641     lea(rscratch1, src);
3642     vsubss(dst, nds, Address(rscratch1, 0));
3643   }
3644 }
3645 
3646 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3647   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3648   vxorps(dst, nds, src, Assembler::AVX_128bit);
3649 }
3650 
3651 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3652   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3653   vxorpd(dst, nds, src, Assembler::AVX_128bit);
3654 }
3655 
3656 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3657   if (reachable(src)) {
3658     vxorpd(dst, nds, as_Address(src), vector_len);
3659   } else {
3660     lea(scratch_reg, src);
3661     vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
3662   }
3663 }
3664 
3665 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3666   if (reachable(src)) {
3667     vxorps(dst, nds, as_Address(src), vector_len);
3668   } else {
3669     lea(scratch_reg, src);
3670     vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
3671   }
3672 }
3673 
3674 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3675   if (UseAVX > 1 || (vector_len < 1)) {
3676     if (reachable(src)) {
3677       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3678     } else {
3679       lea(scratch_reg, src);
3680       Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
3681     }
3682   }
3683   else {
3684     MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
3685   }
3686 }
3687 
3688 void MacroAssembler::vpermd(XMMRegister dst,  XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3689   if (reachable(src)) {
3690     Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3691   } else {
3692     lea(scratch_reg, src);
3693     Assembler::vpermd(dst, nds, Address(scratch_reg, 0), vector_len);
3694   }
3695 }
3696 
// Strip the jweak tag bit from the JNI handle in 'possibly_jweak' so it can
// be used as a plain jobject pointer.
void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
  const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
  STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
  // The inverted mask is sign-extended
  andptr(possibly_jweak, inverted_jweak_mask);
}
3703 
// Resolve the jobject/jweak handle in 'value' to the oop it refers to,
// leaving the result in 'value'.  NULL resolves to NULL; a jweak handle
// (identified by its low tag bit) is loaded with the phantom-reference
// access decorators so the GC barrier sees it correctly.
void MacroAssembler::resolve_jobject(Register value,
                                     Register thread,
                                     Register tmp) {
  assert_different_registers(value, thread, tmp);
  Label done, not_weak;
  testptr(value, value);
  jcc(Assembler::zero, done);                // Use NULL as-is.
  testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
  jcc(Assembler::zero, not_weak);
  // Resolve jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                 value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
  verify_oop(value);
  jmp(done);
  bind(not_weak);
  // Resolve (untagged) jobject.
  access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
  verify_oop(value);
  bind(done);
}
3724 
// Pointer-sized subtract of an immediate: subq on 64-bit, subl on 32-bit.
void MacroAssembler::subptr(Register dst, int32_t imm32) {
  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
}
3728 
// Force generation of a 4 byte immediate value even if it fits into 8bit
// (useful when the instruction must have a fixed length, e.g. for patching).
void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
  LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
}
3733 
// Pointer-sized register-register subtract: subq on 64-bit, subl on 32-bit.
void MacroAssembler::subptr(Register dst, Register src) {
  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
}
3737 
// C++ bool manipulation
// Set the condition codes according to the C++ bool value held in dst,
// using a test whose width matches this platform's sizeof(bool).
void MacroAssembler::testbool(Register dst) {
  if(sizeof(bool) == 1)
    testb(dst, 0xff);
  else if(sizeof(bool) == 2) {
    // testw implementation needed for two byte bools
    ShouldNotReachHere();
  } else if(sizeof(bool) == 4)
    testl(dst, dst);
  else
    // unsupported
    ShouldNotReachHere();
}
3751 
// Pointer-sized register test: testq on 64-bit, testl on 32-bit.
void MacroAssembler::testptr(Register dst, Register src) {
  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
}
3755 
3756 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3757 void MacroAssembler::tlab_allocate(Register thread, Register obj,
3758                                    Register var_size_in_bytes,
3759                                    int con_size_in_bytes,
3760                                    Register t1,
3761                                    Register t2,
3762                                    Label& slow_case) {
3763   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3764   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3765 }
3766 
3767 // Defines obj, preserves var_size_in_bytes
3768 void MacroAssembler::eden_allocate(Register thread, Register obj,
3769                                    Register var_size_in_bytes,
3770                                    int con_size_in_bytes,
3771                                    Register t1,
3772                                    Label& slow_case) {
3773   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3774   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
3775 }
3776 
// Preserves the contents of address, destroys the contents length_in_bytes and temp.
// Zeroes length_in_bytes bytes starting at (address + offset_in_bytes),
// storing word-sized zeros in a loop after an optional 32-bit leading store
// to reach word alignment.
void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
  assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
  assert((offset_in_bytes & (BytesPerInt - 1)) == 0, "offset must be a multiple of BytesPerInt");
  Label done;

  // Nothing to do for a zero-length range.
  testptr(length_in_bytes, length_in_bytes);
  jcc(Assembler::zero, done);

  // Emit single 32bit store to clear leading bytes, if necessary.
  xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
#ifdef _LP64
  if (!is_aligned(offset_in_bytes, BytesPerWord)) {
    movl(Address(address, offset_in_bytes), temp);
    offset_in_bytes += BytesPerInt;
    decrement(length_in_bytes, BytesPerInt);
  }
  assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
  // The leading store may have consumed the whole range.
  testptr(length_in_bytes, length_in_bytes);
  jcc(Assembler::zero, done);
#endif

  // initialize topmost word, divide index by 2, check if odd and test if zero
  // note: for the remaining code to work, index must be a multiple of BytesPerWord
#ifdef ASSERT
  {
    Label L;
    testptr(length_in_bytes, BytesPerWord - 1);
    jcc(Assembler::zero, L);
    stop("length must be a multiple of BytesPerWord");
    bind(L);
  }
#endif
  // From here on, length_in_bytes is reused as a loop index.
  Register index = length_in_bytes;
  if (UseIncDec) {
    shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
  } else {
    shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
    shrptr(index, 1);
  }
#ifndef _LP64
  // index could have not been a multiple of 8 (i.e., bit 2 was set)
  {
    Label even;
    // note: if index was a multiple of 8, then it cannot
    //       be 0 now otherwise it must have been 0 before
    //       => if it is even, we don't need to check for 0 again
    jcc(Assembler::carryClear, even);
    // clear topmost word (no jump would be needed if conditional assignment worked here)
    movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
    // index could be 0 now, must check again
    jcc(Assembler::zero, done);
    bind(even);
  }
#endif // !_LP64
  // initialize remaining object fields: index is a multiple of 2 now
  {
    Label loop;
    bind(loop);
    movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
    NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
    decrement(index);
    jcc(Assembler::notZero, loop);
  }

  bind(done);
}
3844 
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");

  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The loop is peeled once (peel == 1 is the first iteration) so that the
  // common case of a hit on the first itable entry needs only one short branch.
  for (int peel = 1; peel >= 0; peel--) {
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmpptr(intf_klass, method_result);

    if (peel) {
      jccb(Assembler::equal, found_method);
    } else {
      jccb(Assembler::notEqual, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    testptr(method_result, method_result);
    jcc(Assembler::zero, L_no_such_interface);
    addptr(scan_temp, scan_step);
  }

  bind(found_method);

  if (return_method) {
    // Got a hit.
    movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
  }
}
3922 
3923 
// virtual method calling
// Load the Method* at the given vtable slot of recv_klass into method_result.
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
  Address vtable_entry_addr(recv_klass,
                            vtable_index, Address::times_ptr,
                            base + vtableEntry::method_offset_in_bytes());
  movptr(method_result, vtable_entry_addr);
}
3935 
3936 
// Convenience wrapper for a full subtype check: branch to L_success when
// sub_klass is a subtype of super_klass (fast path first, then the slow
// secondary-supers scan); fall through on failure.
void MacroAssembler::check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}
3946 
3947 
// Fast path of a subtype check: first a pointer-equality self check, then a
// probe of the supertype display at super_check_offset.  Branches (or falls
// through) to L_success / L_failure / L_slow_path; any label passed as NULL
// stands for "fall through" (at most one may be NULL).
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb.  If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label)                                \
  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
  else                             jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            jmp(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    // Offset only known at runtime: compare it against sc_offset to decide
    // whether a miss means definite failure or just "consult the slow path".
    local_jcc(Assembler::equal, *L_success);
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}
4049 
4050 
// Slow path of a subtype check: linearly scan sub_klass's secondary-supers
// array for super_klass with repne_scan, and cache a hit in the secondary
// super cache.  L_success / L_failure may be NULL to mean "fall through"
// (at most one).  When set_cond_codes is true, callers additionally rely on
// Z/NZ reflecting success/failure and on rdi being left non-zero.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)

  // Get super_klass value into rax (even if it was in rdi or rcx).
  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  if (super_klass != rax) {
    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
    mov(rax, super_klass);
  }
  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }

#ifndef PRODUCT
  // Count slow-path entries for diagnostics.
  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  NOT_LP64(  incrementl(pst_counter_addr) );
  LP64_ONLY( lea(rcx, pst_counter_addr) );
  LP64_ONLY( incrementl(Address(rcx, 0)) );
#endif //PRODUCT

  // We will consult the secondary-super array.
  movptr(rdi, secondary_supers_addr);
  // Load the array length.  (Positive movl does right thing on LP64.)
  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  addptr(rdi, Array<Klass*>::base_offset_in_bytes());

  // Scan RCX words at [RDI] for an occurrence of RAX.
  // Set NZ/Z based on last compare.
  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
  // not change flags (only scas instruction which is repeated sets flags).
  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.

    testptr(rax,rax); // Set Z = 0
    repne_scan();

  // Unspill the temp. registers:
  if (pushed_rdi)  pop(rdi);
  if (pushed_rcx)  pop(rcx);
  if (pushed_rax)  pop(rax);

  if (set_cond_codes) {
    // Special hack for the AD files:  rdi is guaranteed non-zero.
    assert(!pushed_rdi, "rdi must be left non-NULL");
    // Also, the condition codes are properly set Z/NZ on succeed/failure.
  }

  if (L_failure == &L_fallthrough)
        jccb(Assembler::notEqual, *L_failure);
  else  jcc(Assembler::notEqual, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  movptr(super_cache_addr, super_klass);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
4142 
// Class-initialization barrier: take L_fast_path when 'klass' is fully
// initialized or is currently being initialized by 'thread'; otherwise take
// L_slow_path.  Exactly one of the labels may be NULL, meaning that case
// falls through.
void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");

  Label L_fallthrough;
  if (L_fast_path == NULL) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == NULL) {
    L_slow_path = &L_fallthrough;
  }

  // Fast path check: class is fully initialized
  cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
  jcc(Assembler::equal, *L_fast_path);

  // Fast path check: current thread is initializer thread
  cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
  if (L_slow_path == &L_fallthrough) {
    jcc(Assembler::equal, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    jcc(Assembler::notEqual, *L_slow_path);
    bind(*L_fast_path);
  } else {
    // Both labels real: callers must pass at least one NULL label.
    Unimplemented();
  }
}
4169 
// Conditional 32-bit move from memory. On CPUs without CMOV support,
// emulates it with a short branch around a plain load.
void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
  if (VM_Version::supports_cmov()) {
    cmovl(cc, dst, src);
  } else {
    Label L;
    // Skip the move when the condition does NOT hold.
    jccb(negate_condition(cc), L);
    movl(dst, src);
    bind(L);
  }
}
4180 
// Conditional 32-bit register-to-register move; branch-based fallback for
// CPUs without CMOV (mirrors the Address overload above).
void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
  if (VM_Version::supports_cmov()) {
    cmovl(cc, dst, src);
  } else {
    Label L;
    // Skip the move when the condition does NOT hold.
    jccb(negate_condition(cc), L);
    movl(dst, src);
    bind(L);
  }
}
4191 
// Emits an oop verification sequence (enabled by -XX:+VerifyOops): pushes the
// oop register and a diagnostic message and calls the shared
// verify_oop_subroutine stub. The stub pops the two arguments and restores
// the registers pushed here, so the contract is "modifies nothing".
void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    // Build the message and copy it into the code-string area so it
    // outlives this ResourceMark scope.
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");
#ifdef _LP64
  push(rscratch1);                    // save r10, trashed by movptr()
#endif
  push(rax);                          // save rax,
  push(reg);                          // pass register argument
  ExternalAddress buffer((address) b);
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);
  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (oop, message) and restores rax, r10
  BLOCK_COMMENT("} verify_oop");
}
4220 
// Fills 'dst' with all-ones bits. With AVX-512 (plus VL for sub-512-bit
// vectors) a ternary-logic op with truth table 0xFF produces all ones;
// otherwise vpcmpeqb of a register with itself sets every lane to -1.
void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
  if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
    vpternlogd(dst, 0xFF, dst, dst, vector_len);
  } else {
    assert(UseAVX > 0, "");
    vpcmpeqb(dst, dst, dst, vector_len);
  }
}
4229 
// Returns the stack address of interpreter argument 'arg_slot' (in stack
// element units, constant or in a register), with 'extra_slot_offset'
// additional slots; accounts for the return PC already on the stack.
Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  Register             scale_reg    = noreg;
  Address::ScaleFactor scale_factor = Address::no_scale;
  if (arg_slot.is_constant()) {
    // Constant slot: fold the whole displacement into 'offset'.
    offset += arg_slot.as_constant() * stackElementSize;
  } else {
    // Variable slot: address with a scaled index register.
    scale_reg    = arg_slot.as_register();
    scale_factor = Address::times(stackElementSize);
  }
  offset += wordSize;           // return PC is on stack
  return Address(rsp, scale_reg, scale_factor, offset);
}
4250 
// Like _verify_oop() above, but verifies an oop located at a memory address
// rather than in a register. Pushes the oop value and a message and calls the
// shared verify_oop_subroutine stub, which pops the arguments and restores
// the saved registers.
void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
  if (!VerifyOops) return;

  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    // Build the message in the code-string area so it outlives this scope.
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
    b = code_string(ss.as_string());
  }
#ifdef _LP64
  push(rscratch1);                    // save r10, trashed by movptr()
#endif
  push(rax);                          // save rax,
  // addr may contain rsp so we will have to adjust it based on the push
  // we just did (and on 64 bit we do two pushes)
  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
  // stores rax into addr which is backwards of what was intended.
  if (addr.uses(rsp)) {
    lea(rax, addr);
    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
  } else {
    pushptr(addr);
  }

  ExternalAddress buffer((address) b);
  // pass msg argument
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (addr, message) and restores rax, r10.
}
4290 
// Debug-only sanity check of the current thread's TLAB pointers:
// stops the VM unless start <= top <= end. Emitted only in ASSERT builds
// with +UseTLAB and +VerifyOops.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;
    Register t1 = rsi;
    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);

    // Preserve the temps; on 32-bit the thread must also be looked up.
    push(t1);
    NOT_LP64(push(thread_reg));
    NOT_LP64(get_thread(thread_reg));

    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
    jcc(Assembler::aboveEqual, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    jcc(Assembler::aboveEqual, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    NOT_LP64(pop(thread_reg));
    pop(t1);
  }
#endif
}
4321 
// Decodes the x87 FPU control word (only the low 16 bits of _value are
// meaningful) for the debug CPU-state printer.
class ControlWord {
 public:
  int32_t _value;

  // Two-bit rounding and precision control fields.
  int  rounding_control() const        { return (_value >> 10) & 3; }
  int  precision_control() const       { return (_value >> 8) & 3; }
  // Exception mask bits.
  bool precision() const               { return (_value & (1 << 5)) != 0; }
  bool underflow() const               { return (_value & (1 << 4)) != 0; }
  bool overflow() const                { return (_value & (1 << 3)) != 0; }
  bool zero_divide() const             { return (_value & (1 << 2)) != 0; }
  bool denormalized() const            { return (_value & (1 << 1)) != 0; }
  bool invalid() const                 { return (_value & (1 << 0)) != 0; }

  void print() const {
    // Rounding control: the two-bit field always falls in [0, 3].
    static const char* const rc_names[4] = {
      "round near", "round down", "round up  ", "chop      "
    };
    const char* rc = rc_names[rounding_control()];
    // Precision control: the two-bit field always falls in [0, 3].
    static const char* const pc_names[4] = {
      "24 bits ", "reserved", "53 bits ", "64 bits "
    };
    const char* pc = pc_names[precision_control()];
    // Mask flags: uppercase letter when the mask bit is set.
    char masks[9];
    masks[0] = ' ';
    masks[1] = ' ';
    masks[2] = precision()    ? 'P' : 'p';
    masks[3] = underflow()    ? 'U' : 'u';
    masks[4] = overflow()     ? 'O' : 'o';
    masks[5] = zero_divide()  ? 'Z' : 'z';
    masks[6] = denormalized() ? 'D' : 'd';
    masks[7] = invalid()      ? 'I' : 'i';
    masks[8] = '\0';
    // output
    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, masks, rc, pc);
  }

};
4374 
// Decodes the x87 FPU status word (low 16 bits of _value) for the debug
// CPU-state printer.
class StatusWord {
 public:
  int32_t _value;

  bool busy() const                    { return (_value & (1 << 15)) != 0; }
  // Condition code bits C3..C0.
  bool C3() const                      { return (_value & (1 << 14)) != 0; }
  bool C2() const                      { return (_value & (1 << 10)) != 0; }
  bool C1() const                      { return (_value & (1 <<  9)) != 0; }
  bool C0() const                      { return (_value & (1 <<  8)) != 0; }
  // Top-of-stack pointer (bits 11-13).
  int  top() const                     { return (_value >> 11) & 7; }
  // Exception status bits.
  bool error_status() const            { return (_value & (1 << 7)) != 0; }
  bool stack_fault() const             { return (_value & (1 << 6)) != 0; }
  bool precision() const               { return (_value & (1 << 5)) != 0; }
  bool underflow() const               { return (_value & (1 << 4)) != 0; }
  bool overflow() const                { return (_value & (1 << 3)) != 0; }
  bool zero_divide() const             { return (_value & (1 << 2)) != 0; }
  bool denormalized() const            { return (_value & (1 << 1)) != 0; }
  bool invalid() const                 { return (_value & (1 << 0)) != 0; }

  void print() const {
    // Condition codes: digit when set, '-' when clear.
    char cc[5];
    cc[0] = C3() ? '3' : '-';
    cc[1] = C2() ? '2' : '-';
    cc[2] = C1() ? '1' : '-';
    cc[3] = C0() ? '0' : '-';
    cc[4] = '\0';
    // Exception flags: letter when set, '-' when clear.
    char flags[9];
    flags[0] = error_status() ? 'E' : '-';
    flags[1] = stack_fault()  ? 'S' : '-';
    flags[2] = precision()    ? 'P' : '-';
    flags[3] = underflow()    ? 'U' : '-';
    flags[4] = overflow()     ? 'O' : '-';
    flags[5] = zero_divide()  ? 'Z' : '-';
    flags[6] = denormalized() ? 'D' : '-';
    flags[7] = invalid()      ? 'I' : '-';
    flags[8] = '\0';
    // output
    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, flags, cc, top());
  }

};
4418 
// Decodes the x87 FPU tag word: two bits per physical register
// (0 = valid, 1 = zero, 2 = special, 3 = empty — see FPU_State::tag_as_string).
class TagWord {
 public:
  int32_t _value;

  // Tag for physical register i (i in [0, 7]).
  int tag_at(int i) const              { return (_value >> (2 * i)) & 3; }

  void print() const {
    printf("%04x", 0xFFFF & _value);
  }

};
4430 
// One 80-bit x87 register image as saved in memory: 64-bit significand in
// _m1:_m0 plus a 16-bit sign/exponent word in _ex.
class FPU_Register {
 public:
  int32_t _m0;   // low 32 bits of the significand
  int32_t _m1;   // high 32 bits of the significand
  int16_t _ex;   // sign bit + biased exponent

  // True for the "indefinite" QNaN bit pattern (sign=1, exponent all ones,
  // significand 0xC0000000'00000000).
  bool is_indefinite() const {
    return _m0 == 0 && _m1 == (int32_t)0xC0000000 && _ex == -1;
  }

  void print() const {
    const char sign = (_ex < 0) ? '-' : '+';
    // An all-ones exponent field marks a NaN/infinity encoding.
    const bool nan_exp = (_ex == 0x7FFF) || (_ex == (int16_t)-1);
    printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, nan_exp ? "NaN" : "   ");
  }

};
4448 
// In-memory image of the x87 FPU state (control/status/tag words, error and
// data pointers, and the eight 80-bit registers) as interpreted by the debug
// print/verify helpers below.
class FPU_State {
 public:
  enum {
    register_size       = 10,  // bytes per 80-bit x87 register
    number_of_registers =  8,
    register_mask       =  7
  };

  ControlWord  _control_word;
  StatusWord   _status_word;
  TagWord      _tag_word;
  int32_t      _error_offset;
  int32_t      _error_selector;
  int32_t      _data_offset;
  int32_t      _data_selector;
  int8_t       _register[register_size * number_of_registers];

  // Tag of stack-relative register ST(i), adjusting by the current top.
  int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
  FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }

  const char* tag_as_string(int tag) const {
    switch (tag) {
      case 0: return "valid";
      case 1: return "zero";
      case 2: return "special";
      case 3: return "empty";
    }
    ShouldNotReachHere();
    return NULL;
  }

  void print() const {
    // print computation registers
    { int t = _status_word.top();
      for (int i = 0; i < number_of_registers; i++) {
        int j = (i - t) & register_mask;  // stack position ST(j) of physical register i
        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
        st(j)->print();
        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
      }
    }
    printf("\n");
    // print control registers
    printf("ctrl = "); _control_word.print(); printf("\n");
    printf("stat = "); _status_word .print(); printf("\n");
    printf("tags = "); _tag_word    .print(); printf("\n");
  }

};
4498 
// Decodes the saved EFLAGS register for the debug CPU-state printer.
class Flag_Register {
 public:
  int32_t _value;

  bool overflow() const                { return (_value & (1 << 11)) != 0; }
  bool direction() const               { return (_value & (1 << 10)) != 0; }
  bool sign() const                    { return (_value & (1 <<  7)) != 0; }
  bool zero() const                    { return (_value & (1 <<  6)) != 0; }
  bool auxiliary_carry() const         { return (_value & (1 <<  4)) != 0; }
  bool parity() const                  { return (_value & (1 <<  2)) != 0; }
  bool carry() const                   { return (_value & (1 <<  0)) != 0; }

  void print() const {
    // One letter per flag when set, '-' when clear; order O D S Z A P C.
    char flags[8];
    flags[0] = overflow()        ? 'O' : '-';
    flags[1] = direction()       ? 'D' : '-';
    flags[2] = sign()            ? 'S' : '-';
    flags[3] = zero()            ? 'Z' : '-';
    flags[4] = auxiliary_carry() ? 'A' : '-';
    flags[5] = parity()          ? 'P' : '-';
    flags[6] = carry()           ? 'C' : '-';
    flags[7] = '\0';
    // output
    printf("%08x  flags = %s", _value, flags);
  }

};
4527 
// One saved general-purpose register for the debug CPU-state printer.
class IU_Register {
 public:
  int32_t _value;

  // Prints the value in hex and as a signed decimal.
  void print() const {
    const int32_t v = _value;
    printf("%08x  %11d", v, v);
  }

};
4537 
// Saved integer-unit state (EFLAGS plus the eight general-purpose registers)
// for the debug CPU-state printer; field order must match the layout that
// push_CPU_state() produces on the stack — TODO confirm against that helper.
class IU_State {
 public:
  Flag_Register _eflags;
  IU_Register   _rdi;
  IU_Register   _rsi;
  IU_Register   _rbp;
  IU_Register   _rsp;
  IU_Register   _rbx;
  IU_Register   _rdx;
  IU_Register   _rcx;
  IU_Register   _rax;

  void print() const {
    // computation registers
    // NOTE(review): the stray commas in the "rax,", "rbx," and "rbp," labels
    // look like leftovers of an old register-name spelling; output is
    // debug-only, so the strings are left untouched here.
    printf("rax,  = "); _rax.print(); printf("\n");
    printf("rbx,  = "); _rbx.print(); printf("\n");
    printf("rcx  = "); _rcx.print(); printf("\n");
    printf("rdx  = "); _rdx.print(); printf("\n");
    printf("rdi  = "); _rdi.print(); printf("\n");
    printf("rsi  = "); _rsi.print(); printf("\n");
    printf("rbp,  = "); _rbp.print(); printf("\n");
    printf("rsp  = "); _rsp.print(); printf("\n");
    printf("\n");
    // control registers
    printf("flgs = "); _eflags.print(); printf("\n");
  }
};
4565 
4566 
// Complete saved CPU state (FPU + integer unit), interpreted over the memory
// area that push_CPU_state() leaves on the stack (see print_CPU_state()).
class CPU_State {
 public:
  FPU_State _fpu_state;
  IU_State  _iu_state;

  void print() const {
    printf("--------------------------------------------------\n");
    _iu_state .print();
    printf("\n");
    _fpu_state.print();
    printf("--------------------------------------------------\n");
  }

};
4581 
4582 
// Runtime helper invoked from code generated by print_CPU_state(); the
// argument points at the state pushed by push_CPU_state().
static void _print_CPU_state(CPU_State* state) {
  state->print();
};
4586 
4587 
// Emits code that dumps the full CPU state: saves all registers with
// push_CPU_state(), passes the resulting stack pointer to _print_CPU_state(),
// then restores everything.
void MacroAssembler::print_CPU_state() {
  push_CPU_state();
  push(rsp);                // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize);       // discard argument
  pop_CPU_state();
}
4595 
4596 
4597 #ifndef _LP64
// Runtime helper invoked from code generated by verify_FPU(): checks the
// saved x87 state for a contiguous register stack of the expected depth.
// Returns false (after printing the state and asserting) on failure.
static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
  static int counter = 0;
  FPU_State* fs = &state->_fpu_state;
  counter++;
  // For leaf calls, only verify that the top few elements remain empty.
  // We only need 1 empty at the top for C2 code.
  if( stack_depth < 0 ) {
    if( fs->tag_for_st(7) != 3 ) {
      printf("FPR7 not empty\n");
      state->print();
      assert(false, "error");
      return false;
    }
    return true;                // All other stack states do not matter
  }

  assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(),
         "bad FPU control word");

  // compute stack depth: count the leading non-empty (tag < 3) slots, then
  // require all remaining slots to be empty (tag == 3).
  int i = 0;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
  int d = i;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
  // verify findings
  if (i != FPU_State::number_of_registers) {
    // stack not contiguous
    printf("%s: stack not contiguous at ST%d\n", s, i);
    state->print();
    assert(false, "error");
    return false;
  }
  // check if computed stack depth corresponds to expected stack depth
  // NOTE(review): this branch is unreachable — stack_depth < 0 already
  // returned early at the top of this function.
  if (stack_depth < 0) {
    // expected stack depth is -stack_depth or less
    if (d > -stack_depth) {
      // too many elements on the stack
      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  } else {
    // expected stack depth is stack_depth
    if (d != stack_depth) {
      // wrong stack depth
      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  }
  // everything is cool
  return true;
}
4653 
// Emits a call to _verify_FPU() (enabled by -XX:+VerifyFPU): pushes the CPU
// state, the message and the expected stack depth, and breaks into the
// debugger (int3) if the check fails.
void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
  if (!VerifyFPU) return;
  push_CPU_state();
  push(rsp);                // pass CPU state
  ExternalAddress msg((address) s);
  // pass message string s
  pushptr(msg.addr());
  push(stack_depth);        // pass stack depth
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
  addptr(rsp, 3 * wordSize);   // discard arguments
  // check for error
  { Label L;
    testl(rax, rax);
    jcc(Assembler::notZero, L);
    int3();                  // break if error condition
    bind(L);
  }
  pop_CPU_state();
}
4673 #endif // _LP64
4674 
// Re-establishes the VM's expected CPU control state after native (JNI) code
// may have changed it: MXCSR, the AVX upper-register state, the k1 opmask
// (when post-loop multiversioning uses it) and, on 32-bit, the x87 control
// word.
void MacroAssembler::restore_cpu_control_state_after_jni() {
  // Either restore the MXCSR register after returning from the JNI Call
  // or verify that it wasn't changed (with -Xcheck:jni flag).
  if (VM_Version::supports_sse()) {
    if (RestoreMXCSROnJNICalls) {
      ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()));
    } else if (CheckJNICalls) {
      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
    }
  }
  // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
  vzeroupper();
  // Reset k1 to 0xffff (done by the COMPILER2 block below).

#ifdef COMPILER2
  if (PostLoopMultiversioning && VM_Version::supports_evex()) {
    push(rcx);
    movl(rcx, 0xffff);
    kmovwl(k1, rcx);
    pop(rcx);
  }
#endif // COMPILER2

#ifndef _LP64
  // Either restore the x87 floating pointer control word after returning
  // from the JNI call or verify that it wasn't changed.
  if (CheckJNICalls) {
    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
  }
#endif // _LP64
}
4706 
// ((OopHandle)result).resolve();
// Replaces the OopHandle in 'result' with the oop it refers to, via a
// GC-barrier-aware native load.
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  assert_different_registers(result, tmp);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // OopHandle::resolve is an indirection like jobject.
  access_load_at(T_OBJECT, IN_NATIVE,
                 result, Address(result, 0), tmp, /*tmp_thread*/noreg);
}
4717 
// ((WeakHandle)result).resolve();
// Replaces the WeakHandle in 'rresult' with the oop it refers to, using a
// phantom-reference load; a null handle resolves to null without a load.
void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
  assert_different_registers(rresult, rtmp);
  Label resolved;

  // A null weak handle resolves to null.
  cmpptr(rresult, 0);
  jcc(Assembler::equal, resolved);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // WeakHandle::resolve is an indirection like jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                 rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
  bind(resolved);
}
4734 
// Loads the java.lang.Class mirror of 'method''s holder class into 'mirror':
// method -> holder InstanceKlass -> mirror OopHandle -> resolved oop.
void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
  // get mirror
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  load_method_holder(mirror, method);
  movptr(mirror, Address(mirror, mirror_offset));
  resolve_oop_handle(mirror, tmp);
}
4742 
// Loads the ClassLoaderData of 'rmethod''s holder class into 'rresult'.
void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
  load_method_holder(rresult, rmethod);
  movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
}
4747 
// Loads the InstanceKlass holding 'method' into 'holder' by chasing
// Method* -> ConstMethod* -> ConstantPool* -> pool holder.
void MacroAssembler::load_method_holder(Register holder, Register method) {
  movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
  movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
  movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
}
4753 
4754 #ifdef _LP64
// Loads the narrow (compressed) klass of object 'src' into 'dst'.
// Without compact headers it is a plain 32-bit load from the klass field.
// With compact headers the narrow klass lives in the upper bits of the mark
// word; if the mark is overlaid by an ObjectMonitor pointer, the displaced
// header is fetched from the monitor first.
void MacroAssembler::load_nklass(Register dst, Register src) {
  assert(UseCompressedClassPointers, "expect compressed class pointers");

  if (!UseCompactObjectHeaders) {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    return;
  }

 Label fast;
  movq(dst, Address(src, oopDesc::mark_offset_in_bytes()));
  testb(dst, markWord::monitor_value);
  jccb(Assembler::zero, fast);

  // Fetch displaced header
  movq(dst, Address(dst, OM_OFFSET_NO_MONITOR_VALUE_TAG(header)));

  bind(fast);
  // Extract the narrow klass from the (possibly displaced) mark word.
  shrq(dst, markWord::klass_shift);
}
4774 #endif
4775 
// Loads the Klass* of object 'src' into 'dst', decoding compressed class
// pointers when enabled. Optionally emits an implicit null check on 'src'
// at the field that will be read first.
void MacroAssembler::load_klass(Register dst, Register src, Register tmp, bool null_check_src) {
  assert_different_registers(src, tmp);
  assert_different_registers(dst, tmp);
  if (null_check_src) {
    if (UseCompactObjectHeaders) {
      // The klass is read via the mark word with compact headers.
      null_check(src, oopDesc::mark_offset_in_bytes());
    } else {
      null_check(src, oopDesc::klass_offset_in_bytes());
    }
  }
#ifdef _LP64
  if (UseCompressedClassPointers) {
    load_nklass(dst, src);
    decode_klass_not_null(dst, tmp);
  } else
#endif
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
}
4794 
// Loads the prototype mark-word header of 'src''s klass into 'dst'.
void MacroAssembler::load_prototype_header(Register dst, Register src, Register tmp) {
  load_klass(dst, src, tmp);
  movptr(dst, Address(dst, Klass::prototype_header_offset()));
}
4799 
// Stores klass 'src' into object 'dst''s klass field, encoding it first when
// compressed class pointers are enabled. Not usable with compact headers,
// where the klass lives in the mark word instead.
void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
  assert(!UseCompactObjectHeaders, "not with compact headers");
  assert_different_registers(src, tmp);
  assert_different_registers(dst, tmp);
#ifdef _LP64
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src, tmp);
    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
  } else
#endif
   movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}
4812 
// Compares the narrow/full klass in 'klass' against the klass of object
// 'obj', setting the condition codes for a following jcc.
void MacroAssembler::cmp_klass(Register klass, Register obj, Register tmp) {
#ifdef _LP64
  if (UseCompactObjectHeaders) {
    // NOTE: We need to deal with possible ObjectMonitor in object header.
    // Eventually we might be able to do simple movl & cmpl like in
    // the CCP path below.
    load_nklass(tmp, obj);
    cmpl(klass, tmp);
  } else if (UseCompressedClassPointers) {
    cmpl(klass, Address(obj, oopDesc::klass_offset_in_bytes()));
  } else
#endif
  {
    cmpptr(klass, Address(obj, oopDesc::klass_offset_in_bytes()));
  }
}
4829 
// Compares the klasses of two objects 'src' and 'dst', setting the condition
// codes for a following jcc. 'tmp2' is only needed with compact headers.
void MacroAssembler::cmp_klass(Register src, Register dst, Register tmp1, Register tmp2) {
#ifdef _LP64
  if (UseCompactObjectHeaders) {
    // NOTE: We need to deal with possible ObjectMonitor in object header.
    // Eventually we might be able to do simple movl & cmpl like in
    // the CCP path below.
    assert(tmp2 != noreg, "need tmp2");
    assert_different_registers(src, dst, tmp1, tmp2);
    load_nklass(tmp1, src);
    load_nklass(tmp2, dst);
    cmpl(tmp1, tmp2);
  } else if (UseCompressedClassPointers) {
    movl(tmp1, Address(src, oopDesc::klass_offset_in_bytes()));
    cmpl(tmp1, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else
#endif
  {
    movptr(tmp1, Address(src, oopDesc::klass_offset_in_bytes()));
    cmpptr(tmp1, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}
4851 
// Dispatches a decorated load to the active GC's BarrierSetAssembler;
// AS_RAW loads bypass any GC-specific barrier code.
void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
                                    Register tmp1, Register thread_tmp) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    // Statically bind to the base implementation: no GC barriers.
    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}
4863 
// Dispatches a decorated store to the active GC's BarrierSetAssembler;
// AS_RAW stores bypass any GC-specific barrier code.
void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
                                     Register tmp1, Register tmp2) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    // Statically bind to the base implementation: no GC barriers.
    bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
  } else {
    bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
  }
}
4875 
// Loads an oop from the heap at 'src' into 'dst' with the appropriate
// GC load barrier.
void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}
4880 
// Doesn't do verification, generates fixed size code
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}
4886 
// Stores oop 'src' to the heap at 'dst' with the appropriate GC store barrier.
void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register tmp2, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
}
4891 
// Used for storing NULLs: 'noreg' as the source signals a null store.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
}
4896 
4897 #ifdef _LP64
// Fills the 32-bit klass-gap field of object 'dst' (only present when
// compressed class pointers shrink the header) with 'src'.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
  }
}
4904 
4905 #ifdef ASSERT
// Debug-only check (with -XX:+CheckCompressedOops) that r12 still holds the
// compressed-oops heap base; stops the VM with 'msg' if it does not.
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(rscratch1); // cmpptr trashes rscratch1
    cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
    jcc(Assembler::equal, ok);
    STOP(msg);
    bind(ok);
    pop(rscratch1);
  }
}
4919 #endif
4920 
// Algorithm must match oop.inline.hpp encode_heap_oop.
// Compresses the oop in 'r' in place; NULL encodes to 0.
void MacroAssembler::encode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop");
  if (CompressedOops::base() == NULL) {
    // Zero-based compressed oops: encoding is just an (optional) shift.
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      shrq(r, LogMinObjAlignmentInBytes);
    }
    return;
  }
  // Map NULL to the heap base first, so (r - base) >> shift yields 0 for it.
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}
4939 
// Compresses the known-non-null oop in 'r' in place: subtract the heap base
// (if any), then shift. Callers guarantee r != NULL.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
  if (CompressedOops::base() != NULL) {
    subq(r, r12_heapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}
4960 
// Two-register variant: compresses the known-non-null oop in 'src' into
// 'dst' (src is preserved when the registers differ).
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
  if (dst != src) {
    movq(dst, src);
  }
  if (CompressedOops::base() != NULL) {
    subq(dst, r12_heapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}
4984 
// Decompresses the narrow oop in 'r' in place; a 0 narrow oop decodes to
// NULL (the heap base is not added for it).
void  MacroAssembler::decode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (CompressedOops::base() == NULL) {
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      shlq(r, LogMinObjAlignmentInBytes);
    }
  } else {
    Label done;
    // Relies on shlq setting ZF: a null narrow oop skips the base add so
    // that NULL decodes to NULL.
    shlq(r, LogMinObjAlignmentInBytes);
    jccb(Assembler::equal, done);
    addq(r, r12_heapbase);
    bind(done);
  }
  verify_oop_msg(r, "broken oop in decode_heap_oop");
}
5003 
// Decompresses the known-non-null narrow oop in 'r' in place:
// shift, then add the heap base (if any). Callers guarantee r != 0.
void  MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shlq(r, LogMinObjAlignmentInBytes);
    if (CompressedOops::base() != NULL) {
      addq(r, r12_heapbase);
    }
  } else {
    // Shift of 0 implies a zero base, so decoding is the identity.
    assert (CompressedOops::base() == NULL, "sanity");
  }
}
5021 
// Decompress a known-non-null narrow oop from src into dst:
// dst = (src << shift) + base. Two-register variant of the in-place form above.
void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      // Single LEA does shift+add. r12_heapbase is zeroed in zero-based mode
      // (see reinit_heapbase()), so this is correct with or without a base.
      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    } else {
      if (dst != src) {
        movq(dst, src);
      }
      shlq(dst, LogMinObjAlignmentInBytes);
      if (CompressedOops::base() != NULL) {
        addq(dst, r12_heapbase);
      }
    }
  } else {
    // Unscaled mode: narrow value is already the pointer; just move it.
    assert (CompressedOops::base() == NULL, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
  }
}
5049 
// Compress a non-null Klass* in place: r = (r - klass_base) >> shift.
// Unlike oops, the klass base is not kept in a register, so it is
// materialized into tmp when needed (tmp is clobbered).
void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
  assert_different_registers(r, tmp);
  if (CompressedKlassPointers::base() != NULL) {
    mov64(tmp, (int64_t)CompressedKlassPointers::base());
    subq(r, tmp);
  }
  if (CompressedKlassPointers::shift() != 0) {
    assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    shrq(r, LogKlassAlignmentInBytes);
  }
}
5061 
// Compress a non-null Klass* from src into dst without a scratch register:
// dst = (src - klass_base) >> shift, leaving src untouched.
void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
  assert_different_registers(src, dst);
  if (CompressedKlassPointers::base() != NULL) {
    // Load -base and add src, i.e. dst = src - base, without clobbering src.
    mov64(dst, -(int64_t)CompressedKlassPointers::base());
    addq(dst, src);
  } else {
    movptr(dst, src);
  }
  if (CompressedKlassPointers::shift() != 0) {
    assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    shrq(dst, LogKlassAlignmentInBytes);
  }
}
5075 
5076 // !!! If the instructions that get generated here change then function
5077 // instr_size_for_decode_klass_not_null() needs to get updated.
// Decompress a non-null narrow klass in place: r = (r << shift) + klass_base.
// The klass base is materialized into tmp when needed (tmp is clobbered).
void  MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
  assert_different_registers(r, tmp);
  // Note: it will change flags
  assert(UseCompressedClassPointers, "should only be used for compressed headers");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedKlassPointers::shift() != 0) {
    assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    shlq(r, LogKlassAlignmentInBytes);
  }
  if (CompressedKlassPointers::base() != NULL) {
    mov64(tmp, (int64_t)CompressedKlassPointers::base());
    addq(r, tmp);
  }
}
5094 
// Decompress a non-null narrow klass from src into dst without a scratch
// register: dst = (src << shift) + klass_base, leaving src untouched.
void  MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
  assert_different_registers(src, dst);
  // Note: it will change flags
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.

  if (CompressedKlassPointers::base() == NULL &&
      CompressedKlassPointers::shift() == 0) {
    // The best case scenario is that there is no base or shift. Then it is already
    // a pointer that needs nothing but a register rename.
    // movl zero-extends the 32-bit narrow klass into the full 64-bit dst.
    movl(dst, src);
  } else {
    // Put the base (or 0) into dst, then combine with the shifted src.
    if (CompressedKlassPointers::base() != NULL) {
      mov64(dst, (int64_t)CompressedKlassPointers::base());
    } else {
      xorq(dst, dst);
    }
    if (CompressedKlassPointers::shift() != 0) {
      assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
      assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
      // Single LEA does the shift-by-3 and the base add in one instruction.
      leaq(dst, Address(dst, src, Address::times_8, 0));
    } else {
      addq(dst, src);
    }
  }
}
5123 
5124 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5125   assert (UseCompressedOops, "should only be used for compressed headers");
5126   assert (Universe::heap() != NULL, "java heap should be initialized");
5127   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5128   int oop_index = oop_recorder()->find_index(obj);
5129   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5130   mov_narrow_oop(dst, oop_index, rspec);
5131 }
5132 
5133 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5134   assert (UseCompressedOops, "should only be used for compressed headers");
5135   assert (Universe::heap() != NULL, "java heap should be initialized");
5136   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5137   int oop_index = oop_recorder()->find_index(obj);
5138   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5139   mov_narrow_oop(dst, oop_index, rspec);
5140 }
5141 
5142 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5143   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5144   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5145   int klass_index = oop_recorder()->find_index(k);
5146   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5147   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5148 }
5149 
5150 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5151   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5152   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5153   int klass_index = oop_recorder()->find_index(k);
5154   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5155   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5156 }
5157 
5158 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5159   assert (UseCompressedOops, "should only be used for compressed headers");
5160   assert (Universe::heap() != NULL, "java heap should be initialized");
5161   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5162   int oop_index = oop_recorder()->find_index(obj);
5163   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5164   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5165 }
5166 
5167 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5168   assert (UseCompressedOops, "should only be used for compressed headers");
5169   assert (Universe::heap() != NULL, "java heap should be initialized");
5170   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5171   int oop_index = oop_recorder()->find_index(obj);
5172   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5173   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5174 }
5175 
5176 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5177   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5178   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5179   int klass_index = oop_recorder()->find_index(k);
5180   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5181   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5182 }
5183 
5184 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5185   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5186   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5187   int klass_index = oop_recorder()->find_index(k);
5188   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5189   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5190 }
5191 
// Reload r12_heapbase with the compressed-oop base. Decode/encode helpers
// above assume r12 holds the base (or zero in zero-based mode).
void MacroAssembler::reinit_heapbase() {
  if (UseCompressedOops) {
    if (Universe::heap() != NULL) {
      // Heap exists: the base is a compile-time constant for this VM run.
      if (CompressedOops::base() == NULL) {
        MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
      } else {
        mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
      }
    } else {
      // Heap not initialized yet (code generated before heap setup):
      // load the base indirectly so the value is picked up at run time.
      movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
    }
  }
}
5205 
5206 #endif // _LP64
5207 
5208 // C2 compiled method's prolog code.
// C2 compiled method's prolog code.
// Emits the stack-bang (if requested), pushes/stores rbp, allocates the
// frame, and optionally emits stack-alignment checks and the nmethod entry
// barrier. framesize and stack_bang_size are in bytes and include the
// return-address word on entry.
void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No stack bang: the first instruction must still be >= 5 bytes for
    // patching (see WARNING above), hence the forced 4-byte immediate.
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Check that rsp is StackAlignmentInBytes-aligned minus the pushed word.
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    // Emit the GC's nmethod entry barrier (skipped for stubs).
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->nmethod_entry_barrier(this);
  }
}
5296 
5297 #if COMPILER2_OR_JVMCI
5298 
5299 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
5300 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5301   // cnt - number of qwords (8-byte words).
5302   // base - start address, qword aligned.
5303   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5304   bool use64byteVector = MaxVectorSize == 64 && AVX3Threshold == 0;
5305   if (use64byteVector) {
5306     vpxor(xtmp, xtmp, xtmp, AVX_512bit);
5307   } else if (MaxVectorSize >= 32) {
5308     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5309   } else {
5310     pxor(xtmp, xtmp);
5311   }
5312   jmp(L_zero_64_bytes);
5313 
5314   BIND(L_loop);
5315   if (MaxVectorSize >= 32) {
5316     fill64_avx(base, 0, xtmp, use64byteVector);
5317   } else {
5318     movdqu(Address(base,  0), xtmp);
5319     movdqu(Address(base, 16), xtmp);
5320     movdqu(Address(base, 32), xtmp);
5321     movdqu(Address(base, 48), xtmp);
5322   }
5323   addptr(base, 64);
5324 
5325   BIND(L_zero_64_bytes);
5326   subptr(cnt, 8);
5327   jccb(Assembler::greaterEqual, L_loop);
5328 
5329   // Copy trailing 64 bytes
5330   if (use64byteVector) {
5331     addptr(cnt, 8);
5332     jccb(Assembler::equal, L_end);
5333     fill64_masked_avx(3, base, 0, xtmp, mask, cnt, rtmp, true);
5334     jmp(L_end);
5335   } else {
5336     addptr(cnt, 4);
5337     jccb(Assembler::less, L_tail);
5338     if (MaxVectorSize >= 32) {
5339       vmovdqu(Address(base, 0), xtmp);
5340     } else {
5341       movdqu(Address(base,  0), xtmp);
5342       movdqu(Address(base, 16), xtmp);
5343     }
5344   }
5345   addptr(base, 32);
5346   subptr(cnt, 4);
5347 
5348   BIND(L_tail);
5349   addptr(cnt, 4);
5350   jccb(Assembler::lessEqual, L_end);
5351   if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
5352     fill32_masked_avx(3, base, 0, xtmp, mask, cnt, rtmp);
5353   } else {
5354     decrement(cnt);
5355 
5356     BIND(L_sloop);
5357     movq(Address(base, 0), xtmp);
5358     addptr(base, 8);
5359     decrement(cnt);
5360     jccb(Assembler::greaterEqual, L_sloop);
5361   }
5362   BIND(L_end);
5363 }
5364 
5365 // Clearing constant sized memory using YMM/ZMM registers.
5366 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5367   assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), "");
5368   bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
5369 
5370   int vector64_count = (cnt & (~0x7)) >> 3;
5371   cnt = cnt & 0x7;
5372   const int fill64_per_loop = 4;
5373   const int max_unrolled_fill64 = 8;
5374 
5375   // 64 byte initialization loop.
5376   vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
5377   int start64 = 0;
5378   if (vector64_count > max_unrolled_fill64) {
5379     Label LOOP;
5380     Register index = rtmp;
5381 
5382     start64 = vector64_count - (vector64_count % fill64_per_loop);
5383 
5384     movl(index, 0);
5385     BIND(LOOP);
5386     for (int i = 0; i < fill64_per_loop; i++) {
5387       fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
5388     }
5389     addl(index, fill64_per_loop * 64);
5390     cmpl(index, start64 * 64);
5391     jccb(Assembler::less, LOOP);
5392   }
5393   for (int i = start64; i < vector64_count; i++) {
5394     fill64_avx(base, i * 64, xtmp, use64byteVector);
5395   }
5396 
5397   // Clear remaining 64 byte tail.
5398   int disp = vector64_count * 64;
5399   if (cnt) {
5400     switch (cnt) {
5401       case 1:
5402         movq(Address(base, disp), xtmp);
5403         break;
5404       case 2:
5405         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_128bit);
5406         break;
5407       case 3:
5408         movl(rtmp, 0x7);
5409         kmovwl(mask, rtmp);
5410         evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_256bit);
5411         break;
5412       case 4:
5413         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5414         break;
5415       case 5:
5416         if (use64byteVector) {
5417           movl(rtmp, 0x1F);
5418           kmovwl(mask, rtmp);
5419           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5420         } else {
5421           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5422           movq(Address(base, disp + 32), xtmp);
5423         }
5424         break;
5425       case 6:
5426         if (use64byteVector) {
5427           movl(rtmp, 0x3F);
5428           kmovwl(mask, rtmp);
5429           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5430         } else {
5431           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5432           evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, Assembler::AVX_128bit);
5433         }
5434         break;
5435       case 7:
5436         if (use64byteVector) {
5437           movl(rtmp, 0x7F);
5438           kmovwl(mask, rtmp);
5439           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5440         } else {
5441           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5442           movl(rtmp, 0x7);
5443           kmovwl(mask, rtmp);
5444           evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, Assembler::AVX_256bit);
5445         }
5446         break;
5447       default:
5448         fatal("Unexpected length : %d\n",cnt);
5449         break;
5450     }
5451   }
5452 }
5453 
// Clear cnt qwords starting at base (runtime count variant). Small counts
// use an element-store loop; larger ones use rep stos or XMM stores.
// Register constraints come from the rep-stos instruction (rdi/rax/rcx).
void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
                               bool is_large, KRegister mask) {
  // cnt      - number of qwords (8-byte words).
  // base     - start address, qword aligned.
  // is_large - if optimizers know cnt is larger than InitArrayShortSize
  assert(base==rdi, "base register must be edi for rep stos");
  assert(tmp==rax,   "tmp register must be eax for rep stos");
  assert(cnt==rcx,   "cnt register must be ecx for rep stos");
  assert(InitArrayShortSize % BytesPerLong == 0,
    "InitArrayShortSize should be the multiple of BytesPerLong");

  Label DONE;
  // tmp holds the zero value for the stores (XMM path zeroes xtmp instead).
  if (!is_large || !UseXMMForObjInit) {
    xorptr(tmp, tmp);
  }

  if (!is_large) {
    Label LOOP, LONG;
    cmpptr(cnt, InitArrayShortSize/BytesPerLong);
    jccb(Assembler::greater, LONG);

    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM

    decrement(cnt);
    jccb(Assembler::negative, DONE); // Zero length

    // Use individual pointer-sized stores for small counts:
    BIND(LOOP);
    movptr(Address(base, cnt, Address::times_ptr), tmp);
    decrement(cnt);
    jccb(Assembler::greaterEqual, LOOP);
    jmpb(DONE);

    BIND(LONG);
  }

  // Use longer rep-prefixed ops for non-small counts:
  if (UseFastStosb) {
    shlptr(cnt, 3); // convert to number of bytes
    rep_stosb();
  } else if (UseXMMForObjInit) {
    xmm_clear_mem(base, cnt, tmp, xtmp, mask);
  } else {
    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
    rep_stos();
  }

  BIND(DONE);
}
5503 
5504 #endif //COMPILER2_OR_JVMCI
5505 
5506 
// Fill 'count' elements of type t (T_BYTE/T_SHORT/T_INT) at 'to' with
// 'value'. The element value is first replicated into a full 32-bit
// pattern; the destination is then aligned and filled with progressively
// wider stores (scalar, SSE, AVX2, AVX-512), with scalar tail handling.
// to/value/count are clobbered; rtmp and xtmp are scratch.
void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                   Register to, Register value, Register count,
                                   Register rtmp, XMMRegister xtmp) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(to, value, count, rtmp);
  Label L_exit;
  Label L_fill_2_bytes, L_fill_4_bytes;

  // shift = log2(elements per 4-byte word): 2 for bytes, 1 for shorts,
  // 0 for ints. Used to convert between element counts and store widths.
  int shift = -1;
  switch (t) {
    case T_BYTE:
      shift = 2;
      break;
    case T_SHORT:
      shift = 1;
      break;
    case T_INT:
      shift = 0;
      break;
    default: ShouldNotReachHere();
  }

  // Replicate the element value across all 32 bits of 'value'.
  if (t == T_BYTE) {
    andl(value, 0xff);
    movl(rtmp, value);
    shll(rtmp, 8);
    orl(value, rtmp);
  }
  if (t == T_SHORT) {
    andl(value, 0xffff);
  }
  if (t == T_BYTE || t == T_SHORT) {
    movl(rtmp, value);
    shll(rtmp, 16);
    orl(value, rtmp);
  }

  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
    Label L_skip_align2;
    // align source address at 4 bytes address boundary
    if (t == T_BYTE) {
      Label L_skip_align1;
      // One byte misalignment happens only for byte arrays
      testptr(to, 1);
      jccb(Assembler::zero, L_skip_align1);
      movb(Address(to, 0), value);
      increment(to);
      decrement(count);
      BIND(L_skip_align1);
    }
    // Two bytes misalignment happens only for byte and short (char) arrays
    testptr(to, 2);
    jccb(Assembler::zero, L_skip_align2);
    movw(Address(to, 0), value);
    addptr(to, 2);
    subl(count, 1<<(shift-1));
    BIND(L_skip_align2);
  }
  if (UseSSE < 2) {
    // Pre-SSE2 path: plain 32-bit stores only.
    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
    // Fill 32-byte chunks
    subl(count, 8 << shift);
    jcc(Assembler::less, L_check_fill_8_bytes);
    align(16);

    BIND(L_fill_32_bytes_loop);

    for (int i = 0; i < 32; i += 4) {
      movl(Address(to, i), value);
    }

    addptr(to, 32);
    subl(count, 8 << shift);
    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
    BIND(L_check_fill_8_bytes);
    addl(count, 8 << shift);
    jccb(Assembler::zero, L_exit);
    jmpb(L_fill_8_bytes);

    //
    // length is too short, just fill qwords
    //
    BIND(L_fill_8_bytes_loop);
    movl(Address(to, 0), value);
    movl(Address(to, 4), value);
    addptr(to, 8);
    BIND(L_fill_8_bytes);
    subl(count, 1 << (shift + 1));
    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    // fall through to fill 4 bytes
  } else {
    Label L_fill_32_bytes;
    if (!UseUnalignedLoadStores) {
      // align to 8 bytes, we know we are 4 byte aligned to start
      testptr(to, 4);
      jccb(Assembler::zero, L_fill_32_bytes);
      movl(Address(to, 0), value);
      addptr(to, 4);
      subl(count, 1<<shift);
    }
    BIND(L_fill_32_bytes);
    {
      assert( UseSSE >= 2, "supported cpu only" );
      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
      // Broadcast the 32-bit pattern into xtmp for vector stores.
      movdl(xtmp, value);
      if (UseAVX >= 2 && UseUnalignedLoadStores) {
        Label L_check_fill_32_bytes;
        if (UseAVX > 2) {
          // Fill 64-byte chunks
          Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;

          // If number of bytes to fill < AVX3Threshold, perform fill using AVX2
          cmpl(count, AVX3Threshold);
          jccb(Assembler::below, L_check_fill_64_bytes_avx2);

          vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);

          subl(count, 16 << shift);
          jccb(Assembler::less, L_check_fill_32_bytes);
          align(16);

          BIND(L_fill_64_bytes_loop_avx3);
          evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
          addptr(to, 64);
          subl(count, 16 << shift);
          jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
          jmpb(L_check_fill_32_bytes);

          BIND(L_check_fill_64_bytes_avx2);
        }
        // Fill 64-byte chunks
        Label L_fill_64_bytes_loop;
        vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);

        subl(count, 16 << shift);
        jcc(Assembler::less, L_check_fill_32_bytes);
        align(16);

        BIND(L_fill_64_bytes_loop);
        vmovdqu(Address(to, 0), xtmp);
        vmovdqu(Address(to, 32), xtmp);
        addptr(to, 64);
        subl(count, 16 << shift);
        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);

        BIND(L_check_fill_32_bytes);
        addl(count, 8 << shift);
        jccb(Assembler::less, L_check_fill_8_bytes);
        vmovdqu(Address(to, 0), xtmp);
        addptr(to, 32);
        subl(count, 8 << shift);

        BIND(L_check_fill_8_bytes);
        // clean upper bits of YMM registers
        movdl(xtmp, value);
        pshufd(xtmp, xtmp, 0);
      } else {
        // Fill 32-byte chunks
        pshufd(xtmp, xtmp, 0);

        subl(count, 8 << shift);
        jcc(Assembler::less, L_check_fill_8_bytes);
        align(16);

        BIND(L_fill_32_bytes_loop);

        if (UseUnalignedLoadStores) {
          movdqu(Address(to, 0), xtmp);
          movdqu(Address(to, 16), xtmp);
        } else {
          movq(Address(to, 0), xtmp);
          movq(Address(to, 8), xtmp);
          movq(Address(to, 16), xtmp);
          movq(Address(to, 24), xtmp);
        }

        addptr(to, 32);
        subl(count, 8 << shift);
        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);

        BIND(L_check_fill_8_bytes);
      }
      addl(count, 8 << shift);
      jccb(Assembler::zero, L_exit);
      jmpb(L_fill_8_bytes);

      //
      // length is too short, just fill qwords
      //
      BIND(L_fill_8_bytes_loop);
      movq(Address(to, 0), xtmp);
      addptr(to, 8);
      BIND(L_fill_8_bytes);
      subl(count, 1 << (shift + 1));
      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    }
  }
  // fill trailing 4 bytes
  BIND(L_fill_4_bytes);
  testl(count, 1<<shift);
  jccb(Assembler::zero, L_fill_2_bytes);
  movl(Address(to, 0), value);
  if (t == T_BYTE || t == T_SHORT) {
    Label L_fill_byte;
    addptr(to, 4);
    BIND(L_fill_2_bytes);
    // fill trailing 2 bytes
    testl(count, 1<<(shift-1));
    jccb(Assembler::zero, L_fill_byte);
    movw(Address(to, 0), value);
    if (t == T_BYTE) {
      addptr(to, 2);
      BIND(L_fill_byte);
      // fill trailing byte
      testl(count, 1);
      jccb(Assembler::zero, L_exit);
      movb(Address(to, 0), value);
    } else {
      BIND(L_fill_byte);
    }
  } else {
    BIND(L_fill_2_bytes);
  }
  BIND(L_exit);
}
5734 
5735 // encode char[] to byte[] in ISO_8859_1 or ASCII
5736    //@IntrinsicCandidate
5737    //private static int implEncodeISOArray(byte[] sa, int sp,
5738    //byte[] da, int dp, int len) {
5739    //  int i = 0;
5740    //  for (; i < len; i++) {
5741    //    char c = StringUTF16.getChar(sa, sp++);
5742    //    if (c > '\u00FF')
5743    //      break;
5744    //    da[dp++] = (byte)c;
5745    //  }
5746    //  return i;
5747    //}
5748    //
5749    //@IntrinsicCandidate
5750    //private static int implEncodeAsciiArray(char[] sa, int sp,
5751    //    byte[] da, int dp, int len) {
5752    //  int i = 0;
5753    //  for (; i < len; i++) {
5754    //    char c = sa[sp++];
5755    //    if (c >= '\u0080')
5756    //      break;
5757    //    da[dp++] = (byte)c;
5758    //  }
5759    //  return i;
5760    //}
// Compress UTF-16 chars at src into bytes at dst, stopping at the first
// char that does not fit (>= 0x100 for ISO-8859-1, >= 0x80 for ASCII).
// On exit, result holds the number of chars successfully encoded.
// Vector paths test whole chunks against a replicated mask and bail out
// to narrower/scalar handling when any char in the chunk is out of range.
void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
  XMMRegister tmp1Reg, XMMRegister tmp2Reg,
  XMMRegister tmp3Reg, XMMRegister tmp4Reg,
  Register tmp5, Register result, bool ascii) {

  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(src, dst, len, tmp5, result);
  Label L_done, L_copy_1_char, L_copy_1_char_exit;

  // Bits that must be clear in every char for it to be encodable.
  int mask = ascii ? 0xff80ff80 : 0xff00ff00;
  int short_mask = ascii ? 0xff80 : 0xff00;

  // set result
  xorl(result, result);
  // check for zero length
  testl(len, len);
  jcc(Assembler::zero, L_done);

  movl(result, len);

  // Setup pointers
  // Point src/dst past the end and run len from -len up to 0, so the loop
  // counter doubles as a negative index.
  lea(src, Address(src, len, Address::times_2)); // char[]
  lea(dst, Address(dst, len, Address::times_1)); // byte[]
  negptr(len);

  if (UseSSE42Intrinsics || UseAVX >= 2) {
    Label L_copy_8_chars, L_copy_8_chars_exit;
    Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;

    if (UseAVX >= 2) {
      Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
      movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
      movdl(tmp1Reg, tmp5);
      vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
      jmp(L_chars_32_check);

      // 32 chars per iteration: pack two 32-byte loads into one 32-byte store.
      bind(L_copy_32_chars);
      vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
      vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
      vptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
      jccb(Assembler::notZero, L_copy_32_chars_exit);
      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
      // vpackuswb interleaves 128-bit lanes; vpermq restores byte order.
      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
      vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);

      bind(L_chars_32_check);
      addptr(len, 32);
      jcc(Assembler::lessEqual, L_copy_32_chars);

      bind(L_copy_32_chars_exit);
      subptr(len, 16);
      jccb(Assembler::greater, L_copy_16_chars_exit);

    } else if (UseSSE42Intrinsics) {
      movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
      jmpb(L_chars_16_check);
    }

    // 16 chars per iteration.
    bind(L_copy_16_chars);
    if (UseAVX >= 2) {
      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
      vptest(tmp2Reg, tmp1Reg);
      jcc(Assembler::notZero, L_copy_16_chars_exit);
      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
    } else {
      if (UseAVX > 0) {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
      } else {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        por(tmp2Reg, tmp3Reg);
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        por(tmp2Reg, tmp4Reg);
      }
      ptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
      jccb(Assembler::notZero, L_copy_16_chars_exit);
      packuswb(tmp3Reg, tmp4Reg);
    }
    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);

    bind(L_chars_16_check);
    addptr(len, 16);
    jcc(Assembler::lessEqual, L_copy_16_chars);

    bind(L_copy_16_chars_exit);
    if (UseAVX >= 2) {
      // clean upper bits of YMM registers
      vpxor(tmp2Reg, tmp2Reg);
      vpxor(tmp3Reg, tmp3Reg);
      vpxor(tmp4Reg, tmp4Reg);
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
    }
    subptr(len, 8);
    jccb(Assembler::greater, L_copy_8_chars_exit);

    // 8 chars per iteration.
    bind(L_copy_8_chars);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
    ptest(tmp3Reg, tmp1Reg);
    jccb(Assembler::notZero, L_copy_8_chars_exit);
    packuswb(tmp3Reg, tmp1Reg);
    movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
    addptr(len, 8);
    jccb(Assembler::lessEqual, L_copy_8_chars);

    bind(L_copy_8_chars_exit);
    subptr(len, 8);
    jccb(Assembler::zero, L_done);
  }

  // Scalar loop: copy one char at a time until done or an out-of-range
  // char is found.
  bind(L_copy_1_char);
  load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
  testl(tmp5, short_mask);      // check if Unicode or non-ASCII char
  jccb(Assembler::notZero, L_copy_1_char_exit);
  movb(Address(dst, len, Address::times_1, 0), tmp5);
  addptr(len, 1);
  jccb(Assembler::less, L_copy_1_char);

  bind(L_copy_1_char_exit);
  addptr(result, len); // len is negative count of not processed elements

  bind(L_done);
}
5894 
5895 #ifdef _LP64
5896 /**
5897  * Helper for multiply_to_len().
5898  */
5899 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
5900   addq(dest_lo, src1);
5901   adcq(dest_hi, 0);
5902   addq(dest_lo, src2);
5903   adcq(dest_hi, 0);
5904 }
5905 
5906 /**
5907  * Multiply 64 bit by 64 bit first loop.
5908  */
5909 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5910                                            Register y, Register y_idx, Register z,
5911                                            Register carry, Register product,
5912                                            Register idx, Register kdx) {
5913   //
5914   //  jlong carry, x[], y[], z[];
5915   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
5916   //    huge_128 product = y[idx] * x[xstart] + carry;
5917   //    z[kdx] = (jlong)product;
5918   //    carry  = (jlong)(product >>> 64);
5919   //  }
5920   //  z[xstart] = carry;
5921   //
5922 
5923   Label L_first_loop, L_first_loop_exit;
5924   Label L_one_x, L_one_y, L_multiply;
5925 
5926   decrementl(xstart);
5927   jcc(Assembler::negative, L_one_x);
5928 
5929   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
5930   rorq(x_xstart, 32); // convert big-endian to little-endian
5931 
5932   bind(L_first_loop);
5933   decrementl(idx);
5934   jcc(Assembler::negative, L_first_loop_exit);
5935   decrementl(idx);
5936   jcc(Assembler::negative, L_one_y);
5937   movq(y_idx, Address(y, idx, Address::times_4,  0));
5938   rorq(y_idx, 32); // convert big-endian to little-endian
5939   bind(L_multiply);
5940   movq(product, x_xstart);
5941   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
5942   addq(product, carry);
5943   adcq(rdx, 0);
5944   subl(kdx, 2);
5945   movl(Address(z, kdx, Address::times_4,  4), product);
5946   shrq(product, 32);
5947   movl(Address(z, kdx, Address::times_4,  0), product);
5948   movq(carry, rdx);
5949   jmp(L_first_loop);
5950 
5951   bind(L_one_y);
5952   movl(y_idx, Address(y,  0));
5953   jmp(L_multiply);
5954 
5955   bind(L_one_x);
5956   movl(x_xstart, Address(x,  0));
5957   jmp(L_first_loop);
5958 
5959   bind(L_first_loop_exit);
5960 }
5961 
5962 /**
5963  * Multiply 64 bit by 64 bit and add 128 bit.
5964  */
5965 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
5966                                             Register yz_idx, Register idx,
5967                                             Register carry, Register product, int offset) {
5968   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
5969   //     z[kdx] = (jlong)product;
5970 
5971   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
5972   rorq(yz_idx, 32); // convert big-endian to little-endian
5973   movq(product, x_xstart);
5974   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
5975   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
5976   rorq(yz_idx, 32); // convert big-endian to little-endian
5977 
5978   add2_with_carry(rdx, product, carry, yz_idx);
5979 
5980   movl(Address(z, idx, Address::times_4,  offset+4), product);
5981   shrq(product, 32);
5982   movl(Address(z, idx, Address::times_4,  offset), product);
5983 
5984 }
5985 
5986 /**
5987  * Multiply 128 bit by 128 bit. Unrolled inner loop.
5988  */
5989 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
5990                                              Register yz_idx, Register idx, Register jdx,
5991                                              Register carry, Register product,
5992                                              Register carry2) {
5993   //   jlong carry, x[], y[], z[];
5994   //   int kdx = ystart+1;
5995   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5996   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
5997   //     z[kdx+idx+1] = (jlong)product;
5998   //     jlong carry2  = (jlong)(product >>> 64);
5999   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
6000   //     z[kdx+idx] = (jlong)product;
6001   //     carry  = (jlong)(product >>> 64);
6002   //   }
6003   //   idx += 2;
6004   //   if (idx > 0) {
6005   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
6006   //     z[kdx+idx] = (jlong)product;
6007   //     carry  = (jlong)(product >>> 64);
6008   //   }
6009   //
6010 
6011   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6012 
6013   movl(jdx, idx);
6014   andl(jdx, 0xFFFFFFFC);
6015   shrl(jdx, 2);
6016 
6017   bind(L_third_loop);
6018   subl(jdx, 1);
6019   jcc(Assembler::negative, L_third_loop_exit);
6020   subl(idx, 4);
6021 
6022   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
6023   movq(carry2, rdx);
6024 
6025   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
6026   movq(carry, rdx);
6027   jmp(L_third_loop);
6028 
6029   bind (L_third_loop_exit);
6030 
6031   andl (idx, 0x3);
6032   jcc(Assembler::zero, L_post_third_loop_done);
6033 
6034   Label L_check_1;
6035   subl(idx, 2);
6036   jcc(Assembler::negative, L_check_1);
6037 
6038   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
6039   movq(carry, rdx);
6040 
6041   bind (L_check_1);
6042   addl (idx, 0x2);
6043   andl (idx, 0x1);
6044   subl(idx, 1);
6045   jcc(Assembler::negative, L_post_third_loop_done);
6046 
6047   movl(yz_idx, Address(y, idx, Address::times_4,  0));
6048   movq(product, x_xstart);
6049   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
6050   movl(yz_idx, Address(z, idx, Address::times_4,  0));
6051 
6052   add2_with_carry(rdx, product, yz_idx, carry);
6053 
6054   movl(Address(z, idx, Address::times_4,  0), product);
6055   shrq(product, 32);
6056 
6057   shlq(rdx, 32);
6058   orq(product, rdx);
6059   movq(carry, product);
6060 
6061   bind(L_post_third_loop_done);
6062 }
6063 
6064 /**
6065  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
6066  *
6067  */
6068 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
6069                                                   Register carry, Register carry2,
6070                                                   Register idx, Register jdx,
6071                                                   Register yz_idx1, Register yz_idx2,
6072                                                   Register tmp, Register tmp3, Register tmp4) {
6073   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
6074 
6075   //   jlong carry, x[], y[], z[];
6076   //   int kdx = ystart+1;
6077   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6078   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
6079   //     jlong carry2  = (jlong)(tmp3 >>> 64);
6080   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
6081   //     carry  = (jlong)(tmp4 >>> 64);
6082   //     z[kdx+idx+1] = (jlong)tmp3;
6083   //     z[kdx+idx] = (jlong)tmp4;
6084   //   }
6085   //   idx += 2;
6086   //   if (idx > 0) {
6087   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
6088   //     z[kdx+idx] = (jlong)yz_idx1;
6089   //     carry  = (jlong)(yz_idx1 >>> 64);
6090   //   }
6091   //
6092 
6093   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6094 
6095   movl(jdx, idx);
6096   andl(jdx, 0xFFFFFFFC);
6097   shrl(jdx, 2);
6098 
6099   bind(L_third_loop);
6100   subl(jdx, 1);
6101   jcc(Assembler::negative, L_third_loop_exit);
6102   subl(idx, 4);
6103 
6104   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
6105   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
6106   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
6107   rorxq(yz_idx2, yz_idx2, 32);
6108 
6109   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
6110   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
6111 
6112   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
6113   rorxq(yz_idx1, yz_idx1, 32);
6114   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6115   rorxq(yz_idx2, yz_idx2, 32);
6116 
6117   if (VM_Version::supports_adx()) {
6118     adcxq(tmp3, carry);
6119     adoxq(tmp3, yz_idx1);
6120 
6121     adcxq(tmp4, tmp);
6122     adoxq(tmp4, yz_idx2);
6123 
6124     movl(carry, 0); // does not affect flags
6125     adcxq(carry2, carry);
6126     adoxq(carry2, carry);
6127   } else {
6128     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
6129     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
6130   }
6131   movq(carry, carry2);
6132 
6133   movl(Address(z, idx, Address::times_4, 12), tmp3);
6134   shrq(tmp3, 32);
6135   movl(Address(z, idx, Address::times_4,  8), tmp3);
6136 
6137   movl(Address(z, idx, Address::times_4,  4), tmp4);
6138   shrq(tmp4, 32);
6139   movl(Address(z, idx, Address::times_4,  0), tmp4);
6140 
6141   jmp(L_third_loop);
6142 
6143   bind (L_third_loop_exit);
6144 
6145   andl (idx, 0x3);
6146   jcc(Assembler::zero, L_post_third_loop_done);
6147 
6148   Label L_check_1;
6149   subl(idx, 2);
6150   jcc(Assembler::negative, L_check_1);
6151 
6152   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
6153   rorxq(yz_idx1, yz_idx1, 32);
6154   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
6155   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6156   rorxq(yz_idx2, yz_idx2, 32);
6157 
6158   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
6159 
6160   movl(Address(z, idx, Address::times_4,  4), tmp3);
6161   shrq(tmp3, 32);
6162   movl(Address(z, idx, Address::times_4,  0), tmp3);
6163   movq(carry, tmp4);
6164 
6165   bind (L_check_1);
6166   addl (idx, 0x2);
6167   andl (idx, 0x1);
6168   subl(idx, 1);
6169   jcc(Assembler::negative, L_post_third_loop_done);
6170   movl(tmp4, Address(y, idx, Address::times_4,  0));
6171   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
6172   movl(tmp4, Address(z, idx, Address::times_4,  0));
6173 
6174   add2_with_carry(carry2, tmp3, tmp4, carry);
6175 
6176   movl(Address(z, idx, Address::times_4,  0), tmp3);
6177   shrq(tmp3, 32);
6178 
6179   shlq(carry2, 32);
6180   orq(tmp3, carry2);
6181   movq(carry, tmp3);
6182 
6183   bind(L_post_third_loop_done);
6184 }
6185 
6186 /**
6187  * Code for BigInteger::multiplyToLen() instrinsic.
6188  *
6189  * rdi: x
6190  * rax: xlen
6191  * rsi: y
6192  * rcx: ylen
6193  * r8:  z
6194  * r11: zlen
6195  * r12: tmp1
6196  * r13: tmp2
6197  * r14: tmp3
6198  * r15: tmp4
6199  * rbx: tmp5
6200  *
6201  */
6202 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
6203                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
6204   ShortBranchVerifier sbv(this);
6205   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
6206 
6207   push(tmp1);
6208   push(tmp2);
6209   push(tmp3);
6210   push(tmp4);
6211   push(tmp5);
6212 
6213   push(xlen);
6214   push(zlen);
6215 
6216   const Register idx = tmp1;
6217   const Register kdx = tmp2;
6218   const Register xstart = tmp3;
6219 
6220   const Register y_idx = tmp4;
6221   const Register carry = tmp5;
6222   const Register product  = xlen;
6223   const Register x_xstart = zlen;  // reuse register
6224 
6225   // First Loop.
6226   //
6227   //  final static long LONG_MASK = 0xffffffffL;
6228   //  int xstart = xlen - 1;
6229   //  int ystart = ylen - 1;
6230   //  long carry = 0;
6231   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
6232   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
6233   //    z[kdx] = (int)product;
6234   //    carry = product >>> 32;
6235   //  }
6236   //  z[xstart] = (int)carry;
6237   //
6238 
6239   movl(idx, ylen);      // idx = ylen;
6240   movl(kdx, zlen);      // kdx = xlen+ylen;
6241   xorq(carry, carry);   // carry = 0;
6242 
6243   Label L_done;
6244 
6245   movl(xstart, xlen);
6246   decrementl(xstart);
6247   jcc(Assembler::negative, L_done);
6248 
6249   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
6250 
6251   Label L_second_loop;
6252   testl(kdx, kdx);
6253   jcc(Assembler::zero, L_second_loop);
6254 
6255   Label L_carry;
6256   subl(kdx, 1);
6257   jcc(Assembler::zero, L_carry);
6258 
6259   movl(Address(z, kdx, Address::times_4,  0), carry);
6260   shrq(carry, 32);
6261   subl(kdx, 1);
6262 
6263   bind(L_carry);
6264   movl(Address(z, kdx, Address::times_4,  0), carry);
6265 
6266   // Second and third (nested) loops.
6267   //
6268   // for (int i = xstart-1; i >= 0; i--) { // Second loop
6269   //   carry = 0;
6270   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
6271   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
6272   //                    (z[k] & LONG_MASK) + carry;
6273   //     z[k] = (int)product;
6274   //     carry = product >>> 32;
6275   //   }
6276   //   z[i] = (int)carry;
6277   // }
6278   //
6279   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
6280 
6281   const Register jdx = tmp1;
6282 
6283   bind(L_second_loop);
6284   xorl(carry, carry);    // carry = 0;
6285   movl(jdx, ylen);       // j = ystart+1
6286 
6287   subl(xstart, 1);       // i = xstart-1;
6288   jcc(Assembler::negative, L_done);
6289 
6290   push (z);
6291 
6292   Label L_last_x;
6293   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
6294   subl(xstart, 1);       // i = xstart-1;
6295   jcc(Assembler::negative, L_last_x);
6296 
6297   if (UseBMI2Instructions) {
6298     movq(rdx,  Address(x, xstart, Address::times_4,  0));
6299     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
6300   } else {
6301     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
6302     rorq(x_xstart, 32);  // convert big-endian to little-endian
6303   }
6304 
6305   Label L_third_loop_prologue;
6306   bind(L_third_loop_prologue);
6307 
6308   push (x);
6309   push (xstart);
6310   push (ylen);
6311 
6312 
6313   if (UseBMI2Instructions) {
6314     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
6315   } else { // !UseBMI2Instructions
6316     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
6317   }
6318 
6319   pop(ylen);
6320   pop(xlen);
6321   pop(x);
6322   pop(z);
6323 
6324   movl(tmp3, xlen);
6325   addl(tmp3, 1);
6326   movl(Address(z, tmp3, Address::times_4,  0), carry);
6327   subl(tmp3, 1);
6328   jccb(Assembler::negative, L_done);
6329 
6330   shrq(carry, 32);
6331   movl(Address(z, tmp3, Address::times_4,  0), carry);
6332   jmp(L_second_loop);
6333 
6334   // Next infrequent code is moved outside loops.
6335   bind(L_last_x);
6336   if (UseBMI2Instructions) {
6337     movl(rdx, Address(x,  0));
6338   } else {
6339     movl(x_xstart, Address(x,  0));
6340   }
6341   jmp(L_third_loop_prologue);
6342 
6343   bind(L_done);
6344 
6345   pop(zlen);
6346   pop(xlen);
6347 
6348   pop(tmp5);
6349   pop(tmp4);
6350   pop(tmp3);
6351   pop(tmp2);
6352   pop(tmp1);
6353 }
6354 
/**
 * Find the index of the first element at which arrays obja and objb differ.
 *
 * 'length' arrives as an element count; log2_array_indxscale (in rcx, see
 * note below) is the log2 of the element size. length is converted to a
 * byte count up front (shift left by cl) and 'result' accumulates the byte
 * offset compared so far; every exit path converts it back to an element
 * index with a shift right by cl. On exit 'result' holds the element index
 * of the first mismatch, or -1 if the compared ranges are equal.
 *
 * The comparison is done with progressively narrower strides:
 * 64-byte AVX-512 vectors (when enabled), 32/16-byte vectors, then
 * 8/4-byte words and finally single bytes.
 */
void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
  Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
  Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
  Label VECTOR8_TAIL, VECTOR4_TAIL;
  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
  Label SAME_TILL_END, DONE;
  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;

  //scale is in rcx in both Win64 and Unix
  ShortBranchVerifier sbv(this);

  shlq(length);            // element count -> byte count (shift by cl = scale)
  xorq(result, result);    // byte offset compared so far

  if ((AVX3Threshold == 0) && (UseAVX > 2) &&
      VM_Version::supports_avx512vlbw()) {
    Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;

    cmpq(length, 64);
    jcc(Assembler::less, VECTOR32_TAIL);

    // split length into 64-byte vector part and tail part
    movq(tmp1, length);
    andq(tmp1, 0x3F);      // tail count
    andq(length, ~(0x3F)); //vector count

    bind(VECTOR64_LOOP);
    // AVX512 code to compare 64 byte vectors.
    evmovdqub(rymm0, Address(obja, result), false, Assembler::AVX_512bit);
    evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
    kortestql(k7, k7);
    jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
    addq(result, 64);
    subq(length, 64);
    jccb(Assembler::notZero, VECTOR64_LOOP);

    // full 64-byte vectors matched; any tail bytes left?
    testq(tmp1, tmp1);
    jcc(Assembler::zero, SAME_TILL_END);

    // AVX512 code to compare upto 63 byte vectors.
    // Build a mask with the low tmp1 bits set so only tail bytes are
    // loaded and compared.
    mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp2, tmp2, tmp1);
    notq(tmp2);
    kmovql(k3, tmp2);

    evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
    evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);

    ktestql(k7, k3);
    jcc(Assembler::below, SAME_TILL_END);     // not mismatch

    bind(VECTOR64_NOT_EQUAL);
    // the first zero bit of k7 is the byte offset of the first mismatch
    kmovql(tmp1, k7);
    notq(tmp1);
    tzcntq(tmp1, tmp1);
    addq(result, tmp1);
    shrq(result);    // byte offset -> element index (shift by cl)
    jmp(DONE);
    bind(VECTOR32_TAIL);
  }

  cmpq(length, 8);
  jcc(Assembler::equal, VECTOR8_LOOP);
  jcc(Assembler::less, VECTOR4_TAIL);

  if (UseAVX >= 2) {
    Label VECTOR16_TAIL, VECTOR32_LOOP;

    cmpq(length, 16);
    jcc(Assembler::equal, VECTOR16_LOOP);
    jcc(Assembler::less, VECTOR8_LOOP);

    cmpq(length, 32);
    jccb(Assembler::less, VECTOR16_TAIL);

    subq(length, 32);
    bind(VECTOR32_LOOP);
    vmovdqu(rymm0, Address(obja, result));
    vmovdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);   // zero iff vectors equal
    vptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
    addq(result, 32);
    subq(length, 32);
    jcc(Assembler::greaterEqual, VECTOR32_LOOP);
    addq(length, 32);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 32 bytes left //close the branch here.

    bind(VECTOR16_TAIL);
    cmpq(length, 16);
    jccb(Assembler::less, VECTOR8_TAIL);
    bind(VECTOR16_LOOP);
    movdqu(rymm0, Address(obja, result));
    movdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
    ptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
    addq(result, 16);
    subq(length, 16);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 16 bytes left
  } else {//regular intrinsics

    cmpq(length, 16);
    jccb(Assembler::less, VECTOR8_TAIL);

    subq(length, 16);
    bind(VECTOR16_LOOP);
    movdqu(rymm0, Address(obja, result));
    movdqu(rymm1, Address(objb, result));
    pxor(rymm0, rymm1);      // note: destructive -- rymm0 := obja ^ objb
    ptest(rymm0, rymm0);
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
    addq(result, 16);
    subq(length, 16);
    jccb(Assembler::greaterEqual, VECTOR16_LOOP);
    addq(length, 16);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 16 bytes left
  }

  bind(VECTOR8_TAIL);
  cmpq(length, 8);
  jccb(Assembler::less, VECTOR4_TAIL);
  bind(VECTOR8_LOOP);
  movq(tmp1, Address(obja, result));
  movq(tmp2, Address(objb, result));
  xorq(tmp1, tmp2);
  testq(tmp1, tmp1);
  jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
  addq(result, 8);
  subq(length, 8);
  jcc(Assembler::equal, SAME_TILL_END);
  //falling through if less than 8 bytes left

  bind(VECTOR4_TAIL);
  cmpq(length, 4);
  jccb(Assembler::less, BYTES_TAIL);
  bind(VECTOR4_LOOP);
  movl(tmp1, Address(obja, result));
  xorl(tmp1, Address(objb, result));
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
  addq(result, 4);
  subq(length, 4);
  jcc(Assembler::equal, SAME_TILL_END);
  //falling through if less than 4 bytes left

  // compare the last 1-3 bytes one at a time (unrolled three times)
  bind(BYTES_TAIL);
  bind(BYTES_LOOP);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  decq(length);
  jcc(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  decq(length);
  jcc(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  jmp(SAME_TILL_END);

  if (UseAVX >= 2) {
    // locate the first differing byte within the 32-byte vectors
    bind(VECTOR32_NOT_EQUAL);
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);   // all-ones
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);   // per-byte equality mask
    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);      // invert: 0xFF where differing
    vpmovmskb(tmp1, rymm0);
    bsfq(tmp1, tmp1);        // index of first differing byte
    addq(result, tmp1);
    shrq(result);            // byte offset -> element index (shift by cl)
    jmp(DONE);
  }

  bind(VECTOR16_NOT_EQUAL);
  if (UseAVX >= 2) {
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
    pxor(rymm0, rymm2);
  } else {
    pcmpeqb(rymm2, rymm2);   // all-ones
    pxor(rymm0, rymm1);      // recover obja's bytes (rymm0 held obja ^ objb)
    pcmpeqb(rymm0, rymm1);   // per-byte equality mask
    pxor(rymm0, rymm2);      // invert: 0xFF where differing
  }
  pmovmskb(tmp1, rymm0);
  bsfq(tmp1, tmp1);
  addq(result, tmp1);
  shrq(result);    // byte offset -> element index (shift by cl)
  jmpb(DONE);

  bind(VECTOR8_NOT_EQUAL);
  bind(VECTOR4_NOT_EQUAL);
  // tmp1 holds the XOR of the mismatching words; the first set bit / 8
  // is the byte offset of the first difference (little-endian load order)
  bsfq(tmp1, tmp1);
  shrq(tmp1, 3);
  addq(result, tmp1);
  bind(BYTES_NOT_EQUAL);
  shrq(result);    // byte offset -> element index (shift by cl)
  jmpb(DONE);

  bind(SAME_TILL_END);
  mov64(result, -1);   // no mismatch found

  bind(DONE);
}
6574 
// Helper functions for square_to_len()

/**
 * Store the squares of x[], right shifted one bit (divided by 2) into z[].
 * Preserves x and z and modifies rest of the registers.
 *
 * Requires raxReg == rax and rdxReg == rdx (implicit mulq operands).
 * On exit tmp5 holds the low bit of the last product (the carry that
 * square_to_len's later passes consume).
 */
void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  // Perform square and right shift by 1
  // Handle odd xlen case first, then for even xlen do the following
  // jlong carry = 0;
  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
  //     huge_128 product = x[j:j+1] * x[j:j+1];
  //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
  //     z[i+2:i+3] = (jlong)(product >>> 1);
  //     carry = (jlong)product;
  // }

  xorq(tmp5, tmp5);     // carry
  xorq(rdxReg, rdxReg);
  xorl(tmp1, tmp1);     // index for x
  xorl(tmp4, tmp4);     // index for z

  Label L_first_loop, L_first_loop_exit;

  testl(xlen, 1);
  jccb(Assembler::zero, L_first_loop); //jump if xlen is even

  // Square and right shift by 1 the odd element using 32 bit multiply
  movl(raxReg, Address(x, tmp1, Address::times_4, 0));   // zero-extended 32-bit word
  imulq(raxReg, raxReg);   // square fits in 64 bits (operand < 2^32)
  shrq(raxReg, 1);         // shifted-out bit lands in CF...
  adcq(tmp5, 0);           // ...and is captured as the carry
  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
  incrementl(tmp1);
  addl(tmp4, 2);

  // Square and  right shift by 1 the rest using 64 bit multiply
  bind(L_first_loop);
  cmpptr(tmp1, xlen);
  jccb(Assembler::equal, L_first_loop_exit);

  // Square
  movq(raxReg, Address(x, tmp1, Address::times_4,  0));
  rorq(raxReg, 32);    // convert big-endian to little-endian
  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax

  // Right shift by 1 and save carry
  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
  rcrq(rdxReg, 1);     // previous carry bit rotates into bit 63 of rdx
  rcrq(raxReg, 1);
  adcq(tmp5, 0);       // bit shifted out of rax becomes the next carry

  // Store result in z (raw little-endian 128-bit chunks; lshift_by_1
  // stores the final values back in big-endian word order)
  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
  movq(Address(z, tmp4, Address::times_4, 8), raxReg);

  // Update indices for x and z
  addl(tmp1, 2);
  addl(tmp4, 4);
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}
6638 
6639 
6640 /**
6641  * Perform the following multiply add operation using BMI2 instructions
6642  * carry:sum = sum + op1*op2 + carry
6643  * op2 should be in rdx
6644  * op2 is preserved, all other registers are modified
6645  */
6646 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
6647   // assert op2 is rdx
6648   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
6649   addq(sum, carry);
6650   adcq(tmp2, 0);
6651   addq(sum, op1);
6652   adcq(tmp2, 0);
6653   movq(carry, tmp2);
6654 }
6655 
6656 /**
6657  * Perform the following multiply add operation:
6658  * carry:sum = sum + op1*op2 + carry
6659  * Preserves op1, op2 and modifies rest of registers
6660  */
6661 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
6662   // rdx:rax = op1 * op2
6663   movq(raxReg, op2);
6664   mulq(op1);
6665 
6666   //  rdx:rax = sum + carry + rdx:rax
6667   addq(sum, carry);
6668   adcq(rdxReg, 0);
6669   addq(sum, raxReg);
6670   adcq(rdxReg, 0);
6671 
6672   // carry:sum = rdx:sum
6673   movq(carry, rdxReg);
6674 }
6675 
6676 /**
6677  * Add 64 bit long carry into z[] with carry propogation.
6678  * Preserves z and carry register values and modifies rest of registers.
6679  *
6680  */
6681 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
6682   Label L_fourth_loop, L_fourth_loop_exit;
6683 
6684   movl(tmp1, 1);
6685   subl(zlen, 2);
6686   addq(Address(z, zlen, Address::times_4, 0), carry);
6687 
6688   bind(L_fourth_loop);
6689   jccb(Assembler::carryClear, L_fourth_loop_exit);
6690   subl(zlen, 2);
6691   jccb(Assembler::negative, L_fourth_loop_exit);
6692   addq(Address(z, zlen, Address::times_4, 0), tmp1);
6693   jmp(L_fourth_loop);
6694   bind(L_fourth_loop_exit);
6695 }
6696 
6697 /**
6698  * Shift z[] left by 1 bit.
6699  * Preserves x, len, z and zlen registers and modifies rest of the registers.
6700  *
6701  */
6702 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
6703 
6704   Label L_fifth_loop, L_fifth_loop_exit;
6705 
6706   // Fifth loop
6707   // Perform primitiveLeftShift(z, zlen, 1)
6708 
6709   const Register prev_carry = tmp1;
6710   const Register new_carry = tmp4;
6711   const Register value = tmp2;
6712   const Register zidx = tmp3;
6713 
6714   // int zidx, carry;
6715   // long value;
6716   // carry = 0;
6717   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
6718   //    (carry:value)  = (z[i] << 1) | carry ;
6719   //    z[i] = value;
6720   // }
6721 
6722   movl(zidx, zlen);
6723   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
6724 
6725   bind(L_fifth_loop);
6726   decl(zidx);  // Use decl to preserve carry flag
6727   decl(zidx);
6728   jccb(Assembler::negative, L_fifth_loop_exit);
6729 
6730   if (UseBMI2Instructions) {
6731      movq(value, Address(z, zidx, Address::times_4, 0));
6732      rclq(value, 1);
6733      rorxq(value, value, 32);
6734      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6735   }
6736   else {
6737     // clear new_carry
6738     xorl(new_carry, new_carry);
6739 
6740     // Shift z[i] by 1, or in previous carry and save new carry
6741     movq(value, Address(z, zidx, Address::times_4, 0));
6742     shlq(value, 1);
6743     adcl(new_carry, 0);
6744 
6745     orq(value, prev_carry);
6746     rorq(value, 0x20);
6747     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6748 
6749     // Set previous carry = new carry
6750     movl(prev_carry, new_carry);
6751   }
6752   jmp(L_fifth_loop);
6753 
6754   bind(L_fifth_loop_exit);
6755 }
6756 
6757 
6758 /**
6759  * Code for BigInteger::squareToLen() intrinsic
6760  *
6761  * rdi: x
6762  * rsi: len
6763  * r8:  z
6764  * rcx: zlen
6765  * r12: tmp1
6766  * r13: tmp2
6767  * r14: tmp3
6768  * r15: tmp4
6769  * rbx: tmp5
6770  *
6771  */
void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
  // Preserve the caller's temp registers; restored before returning.
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  // First loop
  // Store the squares, right shifted one bit (i.e., divided by 2).
  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Add in off-diagonal sums.
  //
  // Second, third (nested) and fourth loops.
  // zlen +=2;
  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
  //    carry = 0;
  //    long op2 = x[xidx:xidx+1];
  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
  //       k -= 2;
  //       long op1 = x[j:j+1];
  //       long sum = z[k:k+1];
  //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
  //       z[k:k+1] = sum;
  //    }
  //    add_one_64(z, k, carry, tmp_regs);
  // }

  const Register carry = tmp5;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  Register op2 = tmp2;

  // Save the loop bounds; zlen/len are decremented in-place below and the
  // per-iteration copies pushed inside the loop are reloaded each time around.
  push(zlen);
  push(len);
  addl(zlen,2);
  bind(L_second_loop);
  xorq(carry, carry);
  subl(zlen, 4);
  subl(len, 2);
  push(zlen);
  push(len);
  cmpl(len, 0);
  jccb(Assembler::lessEqual, L_second_loop_exit);

  // Multiply an array by one 64 bit long.
  if (UseBMI2Instructions) {
    // MULX implicitly uses rdx as one multiplicand, so op2 must live there.
    op2 = rdxReg;
    movq(op2, Address(x, len, Address::times_4,  0));
    rorxq(op2, op2, 32);  // swap halves: big-endian int pair -> little-endian long
  }
  else {
    movq(op2, Address(x, len, Address::times_4,  0));
    rorq(op2, 32);  // swap halves: big-endian int pair -> little-endian long
  }

  bind(L_third_loop);
  decrementl(len);
  jccb(Assembler::negative, L_third_loop_exit);
  decrementl(len);
  jccb(Assembler::negative, L_last_x);  // only a single 32-bit int remains

  movq(op1, Address(x, len, Address::times_4,  0));
  rorq(op1, 32);

  bind(L_multiply);
  subl(zlen, 2);
  movq(sum, Address(z, zlen, Address::times_4,  0));

  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  movq(Address(z, zlen, Address::times_4, 0), sum);

  jmp(L_third_loop);
  bind(L_third_loop_exit);

  // Fourth loop
  // Add 64 bit long carry into z with carry propagation.
  // Uses offsetted zlen.
  add_one_64(z, zlen, carry, tmp1);

  pop(len);
  pop(zlen);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  movl(op1, Address(x, 0));  // single remaining 32-bit word at x[0]
  jmp(L_multiply);

  bind(L_second_loop_exit);
  // Drop the per-iteration copies, then restore the original len/zlen.
  pop(len);
  pop(zlen);
  pop(len);
  pop(zlen);

  // Fifth loop
  // Shift z left 1 bit.
  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);

  // z[zlen-1] |= x[len-1] & 1;
  movl(tmp3, Address(x, len, Address::times_4, -4));
  andl(tmp3, 1);
  orl(Address(z, zlen, Address::times_4,  -4), tmp3);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
6891 
6892 /**
6893  * Helper function for mul_add()
6894  * Multiply the in[] by int k and add to out[] starting at offset offs using
6895  * 128 bit by 32 bit multiply and return the carry in tmp5.
6896  * Only quad int aligned length of in[] is operated on in this function.
6897  * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
6898  * This function preserves out, in and k registers.
 * len and offset point to the appropriate index in "in" & "out" respectively.
6900  * tmp5 has the carry.
6901  * other registers are temporary and are modified.
6902  *
6903  */
void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
  Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_first_loop, L_first_loop_exit;

  // tmp1 = number of quad-int (16-byte) chunks to process.
  movl(tmp1, len);
  shrl(tmp1, 2);

  bind(L_first_loop);
  subl(tmp1, 1);
  jccb(Assembler::negative, L_first_loop_exit);

  subl(len, 4);
  subl(offset, 4);

  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry = tmp5;

  if (UseBMI2Instructions) {
    // MULX implicitly multiplies by rdx, so k is expected in rdxReg (see header).
    op2 = rdxReg;
  }

  // Upper two ints of the 128-bit chunk.
  movq(op1, Address(in, len, Address::times_4,  8));
  rorq(op1, 32);  // swap halves: big-endian int pair -> little-endian long
  movq(sum, Address(out, offset, Address::times_4,  8));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4,  8), sum);

  // Lower two ints of the 128-bit chunk.
  movq(op1, Address(in, len, Address::times_4,  0));
  rorq(op1, 32);
  movq(sum, Address(out, offset, Address::times_4,  0));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4,  0), sum);

  jmp(L_first_loop);
  bind(L_first_loop_exit);
}
6960 
6961 /**
6962  * Code for BigInteger::mulAdd() intrinsic
6963  *
6964  * rdi: out
6965  * rsi: in
6966  * r11: offs (out.length - offset)
6967  * rcx: len
6968  * r8:  k
6969  * r12: tmp1
6970  * r13: tmp2
6971  * r14: tmp3
6972  * r15: tmp4
6973  * rbx: tmp5
6974  * Multiply the in[] by word k and add to out[], return the carry in rax
6975  */
void MacroAssembler::mul_add(Register out, Register in, Register offs,
   Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_carry, L_last_in, L_done;

// carry = 0;
// for (int j=len-1; j >= 0; j--) {
//    long product = (in[j] & LONG_MASK) * kLong +
//                   (out[offs] & LONG_MASK) + carry;
//    out[offs--] = (int)product;
//    carry = product >>> 32;
// }
//
  // Preserve the caller's temp registers; restored before returning.
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry =  tmp5;

  if (UseBMI2Instructions) {
    // MULX implicitly multiplies by rdx, so keep k there.
    op2 = rdxReg;
    movl(op2, k);
  }
  else {
    movl(op2, k);
  }

  xorq(carry, carry);

  //First loop

  //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
  //The carry is in tmp5
  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);

  //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
  decrementl(len);
  jccb(Assembler::negative, L_carry);    // no trailing ints left
  decrementl(len);
  jccb(Assembler::negative, L_last_in);  // exactly one int left

  // Two or three ints remain: process a 64-bit (two-int) chunk first.
  movq(op1, Address(in, len, Address::times_4,  0));
  rorq(op1, 32);  // swap halves: big-endian int pair -> little-endian long

  subl(offs, 2);
  movq(sum, Address(out, offs, Address::times_4,  0));
  rorq(sum, 32);

  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offs, Address::times_4,  0), sum);

  testl(len, len);
  jccb(Assembler::zero, L_carry);

  //Multiply the last in[] entry, if any
  bind(L_last_in);
  movl(op1, Address(in, 0));
  movl(sum, Address(out, offs, Address::times_4,  -4));

  // 32x32 -> 64 multiply-accumulate: sum += op1 * k + carry.
  movl(raxReg, k);
  mull(op1); //tmp4 * eax -> edx:eax
  addl(sum, carry);
  adcl(rdxReg, 0);
  addl(sum, raxReg);
  adcl(rdxReg, 0);
  movl(carry, rdxReg);

  movl(Address(out, offs, Address::times_4,  -4), sum);

  bind(L_carry);
  //return tmp5/carry as carry in rax
  movl(rax, carry);

  bind(L_done);
  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
7070 #endif
7071 
7072 /**
7073  * Emits code to update CRC-32 with a byte value according to constants in table
7074  *
7075  * @param [in,out]crc   Register containing the crc.
7076  * @param [in]val       Register containing the byte to fold into the CRC.
7077  * @param [in]table     Register containing the table of crc constants.
7078  *
7079  * uint32_t crc;
7080  * val = crc_table[(val ^ crc) & 0xFF];
7081  * crc = val ^ (crc >> 8);
7082  *
7083  */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  xorl(val, crc);                                       // val = val ^ crc
  andl(val, 0xFF);                                      // index into the 256-entry table
  shrl(crc, 8); // unsigned shift
  xorl(crc, Address(table, val, Address::times_4, 0));  // crc = table[index] ^ (crc >> 8)
}
7090 
7091 /**
7092  * Fold 128-bit data chunk
7093  */
// Fold the 128-bit crc accumulator with constants xK, then xor in the next
// 16 bytes of input from memory at buf+offset. xtmp is scratch.
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
  if (UseAVX > 0) {
    vpclmulhdq(xtmp, xK, xcrc); // [123:64]
    vpclmulldq(xcrc, xK, xcrc); // [63:0]
    vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
    pxor(xcrc, xtmp);
  } else {
    // Legacy pclmul is destructive, so copy xcrc before the high-half multiply.
    movdqa(xtmp, xcrc);
    pclmulhdq(xtmp, xK);   // [123:64]
    pclmulldq(xcrc, xK);   // [63:0]
    pxor(xcrc, xtmp);
    movdqu(xtmp, Address(buf, offset));
    pxor(xcrc, xtmp);
  }
}
7109 
// Same fold as above, but the 16 bytes of data come from register xbuf
// instead of memory. xtmp is scratch.
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
  if (UseAVX > 0) {
    vpclmulhdq(xtmp, xK, xcrc);
    vpclmulldq(xcrc, xK, xcrc);
    pxor(xcrc, xbuf);
    pxor(xcrc, xtmp);
  } else {
    // Legacy pclmul is destructive, so copy xcrc before the high-half multiply.
    movdqa(xtmp, xcrc);
    pclmulhdq(xtmp, xK);
    pclmulldq(xcrc, xK);
    pxor(xcrc, xbuf);
    pxor(xcrc, xtmp);
  }
}
7124 
7125 /**
7126  * 8-bit folds to compute 32-bit CRC
7127  *
7128  * uint64_t xcrc;
7129  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7130  */
void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
  movdl(tmp, xcrc);                                      // low 32 bits of xcrc
  andl(tmp, 0xFF);                                       // low byte indexes the table
  movdl(xtmp, Address(table, tmp, Address::times_4, 0));
  psrldq(xcrc, 1); // unsigned shift one byte
  pxor(xcrc, xtmp);
}
7138 
7139 /**
7140  * uint32_t crc;
7141  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7142  */
void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
  movl(tmp, crc);
  andl(tmp, 0xFF);                                     // low byte indexes the table
  shrl(crc, 8);
  xorl(crc, Address(table, tmp, Address::times_4, 0)); // crc = table[byte] ^ (crc >> 8)
}
7149 
7150 /**
7151  * @param crc   register containing existing CRC (32-bit)
7152  * @param buf   register pointing to input byte buffer (byte*)
7153  * @param len   register containing number of bytes
7154  * @param table register that will contain address of CRC table
7155  * @param tmp   scratch register
7156  */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
  assert_different_registers(crc, buf, len, table, tmp, rax);

  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;

  // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
  // context for the registers used, where all instructions below are using 128-bit mode
  // On EVEX without VL and BW, these instructions will all be AVX.
  lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
  notl(crc); // ~crc
  cmpl(len, 16);
  jcc(Assembler::less, L_tail);  // too short for vector folding; go byte-at-a-time

  // Align buffer to 16 bytes
  movl(tmp, buf);
  andl(tmp, 0xF);
  jccb(Assembler::zero, L_aligned);
  subl(tmp,  16);   // tmp = -(bytes needed to reach alignment)
  addl(len, tmp);

  align(4);
  BIND(L_align_loop);
  movsbl(rax, Address(buf, 0)); // load byte with sign extension
  update_byte_crc32(crc, rax, table);
  increment(buf);
  incrementl(tmp);   // tmp counts up from negative toward zero
  jccb(Assembler::less, L_align_loop);

  BIND(L_aligned);
  movl(tmp, len); // save
  shrl(len, 4);   // len = number of 16-byte chunks
  jcc(Assembler::zero, L_tail_restore);

  // Fold crc into first bytes of vector
  movdqa(xmm1, Address(buf, 0));
  movdl(rax, xmm1);
  xorl(crc, rax);
  if (VM_Version::supports_sse4_1()) {
    pinsrd(xmm1, crc, 0);
  } else {
    // No pinsrd on pre-SSE4.1: insert the 32-bit crc as two 16-bit halves.
    pinsrw(xmm1, crc, 0);
    shrl(crc, 16);
    pinsrw(xmm1, crc, 1);
  }
  addptr(buf, 16);
  subl(len, 4); // len > 0
  jcc(Assembler::less, L_fold_tail);

  movdqa(xmm2, Address(buf,  0));
  movdqa(xmm3, Address(buf, 16));
  movdqa(xmm4, Address(buf, 32));
  addptr(buf, 48);
  subl(len, 3);
  jcc(Assembler::lessEqual, L_fold_512b);

  // Fold total 512 bits of polynomial on each iteration,
  // 128 bits per each of 4 parallel streams.
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));

  align32();
  BIND(L_fold_512b_loop);
  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
  fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
  fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
  fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
  addptr(buf, 64);
  subl(len, 4);
  jcc(Assembler::greater, L_fold_512b_loop);

  // Fold 512 bits to 128 bits.
  BIND(L_fold_512b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);

  // Fold the rest of 128 bits data chunks
  BIND(L_fold_tail);
  addl(len, 3);
  jccb(Assembler::lessEqual, L_fold_128b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));

  BIND(L_fold_tail_loop);
  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
  addptr(buf, 16);
  decrementl(len);
  jccb(Assembler::greater, L_fold_tail_loop);

  // Fold 128 bits in xmm1 down into 32 bits in crc register.
  BIND(L_fold_128b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
  if (UseAVX > 0) {
    vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
    vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
    vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
  } else {
    // Legacy pclmul is destructive; copy the constants before multiplying.
    movdqa(xmm2, xmm0);
    pclmulqdq(xmm2, xmm1, 0x1);
    movdqa(xmm3, xmm0);
    pand(xmm3, xmm2);
    pclmulqdq(xmm0, xmm3, 0x1);
  }
  psrldq(xmm1, 8);
  psrldq(xmm2, 4);
  pxor(xmm0, xmm1);
  pxor(xmm0, xmm2);

  // 8 8-bit folds to compute 32-bit CRC.
  for (int j = 0; j < 4; j++) {
    fold_8bit_crc32(xmm0, table, xmm1, rax);
  }
  movdl(crc, xmm0); // mov 32 bits to general register
  for (int j = 0; j < 4; j++) {
    fold_8bit_crc32(crc, table, rax);
  }

  BIND(L_tail_restore);
  movl(len, tmp); // restore
  BIND(L_tail);
  andl(len, 0xf);   // remaining 0..15 bytes
  jccb(Assembler::zero, L_exit);

  // Fold the rest of bytes
  align(4);
  BIND(L_tail_loop);
  movsbl(rax, Address(buf, 0)); // load byte with sign extension
  update_byte_crc32(crc, rax, table);
  increment(buf);
  decrementl(len);
  jccb(Assembler::greater, L_tail_loop);

  BIND(L_exit);
  notl(crc); // ~crc
}
7292 
7293 #ifdef _LP64
7294 // Helper function for AVX 512 CRC32
7295 // Fold 512-bit data chunks
void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
                                             Register pos, int offset) {
  // NOTE: clobbers xmm2 and xmm3 as additional scratch registers.
  evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit); // next 64B of input
  evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
  evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
  evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
  evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);  // fold in the new data
}
7304 
7305 // Helper function for AVX 512 CRC32
7306 // Compute CRC32 for < 256B buffers
void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
                                              Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
                                              Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {

  Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
  Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
  Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;

  // check if there is enough buffer to be able to fold 16B at a time
  cmpl(len, 32);
  jcc(Assembler::less, L_less_than_32);

  // if there is, load the constants
  movdqu(xmm10, Address(table, 1 * 16));    //rk1 and rk2 in xmm10
  movdl(xmm0, crc);                        // get the initial crc value
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
  pxor(xmm7, xmm0);

  // update the buffer pointer
  addl(pos, 16);
  //update the counter. subtract 32 instead of 16 to save one instruction from the loop
  subl(len, 32);
  jmp(L_16B_reduction_loop);

  bind(L_less_than_32);
  //mov initial crc to the return value. this is necessary for zero-length buffers.
  movl(rax, crc);
  testl(len, len);
  jcc(Assembler::equal, L_cleanup);

  movdl(xmm0, crc);                        //get the initial crc value

  cmpl(len, 16);
  jcc(Assembler::equal, L_exact_16_left);
  jcc(Assembler::less, L_less_than_16_left);

  // 17..31 bytes: consume one full 16B block, leave the rest for tail handling.
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
  pxor(xmm7, xmm0);                       //xor the initial crc value
  addl(pos, 16);
  subl(len, 16);
  movdqu(xmm10, Address(table, 1 * 16));    // rk1 and rk2 in xmm10
  jmp(L_get_last_two_xmms);

  bind(L_less_than_16_left);
  //use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
  pxor(xmm1, xmm1);
  movptr(tmp1, rsp);
  movdqu(Address(tmp1, 0 * 16), xmm1);

  cmpl(len, 4);
  jcc(Assembler::less, L_only_less_than_4);

  //backup the counter value
  movl(tmp2, len);
  cmpl(len, 8);
  jcc(Assembler::less, L_less_than_8_left);

  //load 8 Bytes
  movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
  movq(Address(tmp1, 0 * 16), rax);
  addptr(tmp1, 8);
  subl(len, 8);
  addl(pos, 8);

  bind(L_less_than_8_left);
  cmpl(len, 4);
  jcc(Assembler::less, L_less_than_4_left);

  //load 4 Bytes
  movl(rax, Address(buf, pos, Address::times_1, 0));
  movl(Address(tmp1, 0 * 16), rax);
  addptr(tmp1, 4);
  subl(len, 4);
  addl(pos, 4);

  bind(L_less_than_4_left);
  cmpl(len, 2);
  jcc(Assembler::less, L_less_than_2_left);

  // load 2 Bytes
  movw(rax, Address(buf, pos, Address::times_1, 0));
  movl(Address(tmp1, 0 * 16), rax);
  addptr(tmp1, 2);
  subl(len, 2);
  addl(pos, 2);

  bind(L_less_than_2_left);
  cmpl(len, 1);
  jcc(Assembler::less, L_zero_left);

  // load 1 Byte
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0 * 16), rax);

  bind(L_zero_left);
  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0);                       //xor the initial crc value

  // Shift the partial block into position via the shuffle table, indexed by
  // the original byte count saved in tmp2.
  lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
  movdqu(xmm0, Address(rax, tmp2));
  pshufb(xmm7, xmm0);
  jmp(L_128_done);

  bind(L_exact_16_left);
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
  pxor(xmm7, xmm0);                       //xor the initial crc value
  jmp(L_128_done);

  bind(L_only_less_than_4);
  cmpl(len, 3);
  jcc(Assembler::less, L_only_less_than_3);

  // load 3 Bytes
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movb(rax, Address(buf, pos, Address::times_1, 1));
  movb(Address(tmp1, 1), rax);

  movb(rax, Address(buf, pos, Address::times_1, 2));
  movb(Address(tmp1, 2), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0);                     //xor the initial crc value

  pslldq(xmm7, 0x5);
  jmp(L_barrett);
  bind(L_only_less_than_3);
  cmpl(len, 2);
  jcc(Assembler::less, L_only_less_than_2);

  // load 2 Bytes
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movb(rax, Address(buf, pos, Address::times_1, 1));
  movb(Address(tmp1, 1), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0);                     //xor the initial crc value

  pslldq(xmm7, 0x6);
  jmp(L_barrett);

  bind(L_only_less_than_2);
  //load 1 Byte
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0);                     //xor the initial crc value

  pslldq(xmm7, 0x7);
  // falls through to the caller's L_barrett (bound right after the call site)
}
7461 
7462 /**
7463 * Compute CRC32 using AVX512 instructions
7464 * param crc   register containing existing CRC (32-bit)
7465 * param buf   register pointing to input byte buffer (byte*)
7466 * param len   register containing number of bytes
7467 * param table address of crc or crc32c table
7468 * param tmp1  scratch register
7469 * param tmp2  scratch register
7470 * return rax  result register
7471 *
7472 * This routine is identical for crc32c with the exception of the precomputed constant
7473 * table which will be passed as the table argument.  The calculation steps are
7474 * the same for both variants.
7475 */
void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
  assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);

  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
  Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
  Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
  Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;

  const Register pos = r12;
  push(r12);
  subptr(rsp, 16 * 2 + 8);   // stack scratch area (used for partial-block loads in the 256B helper)

  // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
  // context for the registers used, where all instructions below are using 128-bit mode
  // On EVEX without VL and BW, these instructions will all be AVX.
  movl(pos, 0);

  // check if smaller than 256B
  cmpl(len, 256);
  jcc(Assembler::less, L_less_than_256);

  // load the initial crc value
  movdl(xmm10, crc);

  // receive the initial 64B data, xor the initial crc value
  evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
  evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
  evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
  evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4

  subl(len, 256);
  cmpl(len, 256);
  jcc(Assembler::less, L_fold_128_B_loop);

  evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
  evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
  evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
  subl(len, 256);

  // Fold 256B per iteration across four 512-bit accumulators.
  bind(L_fold_256_B_loop);
  addl(pos, 256);
  fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
  fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
  fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
  fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);

  subl(len, 256);
  jcc(Assembler::greaterEqual, L_fold_256_B_loop);

  // Fold 256 into 128
  addl(pos, 256);
  evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
  evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
  vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC

  evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
  evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
  vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC

  evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
  evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);

  addl(len, 128);
  jmp(L_fold_128_B_register);

  // at this section of the code, there is 128 * x + y (0 <= y < 128) bytes of buffer. The fold_128_B_loop
  // loop will fold 128B at a time until we have 128 + y Bytes of buffer

  // fold 128B at a time. This section of the code folds 8 xmm registers in parallel
  bind(L_fold_128_B_loop);
  addl(pos, 128);
  fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
  fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);

  subl(len, 128);
  jcc(Assembler::greaterEqual, L_fold_128_B_loop);

  addl(pos, 128);

  // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
  // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
  bind(L_fold_128_B_register);
  evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
  evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
  evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
  evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
  // save last that has no multiplicand
  vextracti64x2(xmm7, xmm4, 3);

  evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
  evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
  // Needed later in reduction loop
  movdqu(xmm10, Address(table, 1 * 16));
  vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
  vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC

  // Swap 1,0,3,2 - 01 00 11 10
  evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
  evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
  vextracti128(xmm5, xmm8, 1);
  evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);

  // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
  // instead of a cmp instruction, we use the negative flag with the jl instruction
  addl(len, 128 - 16);
  jcc(Assembler::less, L_final_reduction_for_128);

  // Fold 16B at a time into the single 128-bit accumulator xmm7.
  bind(L_16B_reduction_loop);
  vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
  vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
  vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
  movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
  vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
  addl(pos, 16);
  subl(len, 16);
  jcc(Assembler::greaterEqual, L_16B_reduction_loop);

  bind(L_final_reduction_for_128);
  addl(len, 16);
  jcc(Assembler::equal, L_128_done);

  bind(L_get_last_two_xmms);
  movdqu(xmm2, xmm7);
  addl(pos, len);
  movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
  subl(pos, len);

  // get rid of the extra data that was loaded before
  // load the shift constant
  lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
  movdqu(xmm0, Address(rax, len));
  addl(rax, len);

  vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
  //Change mask to 512
  vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
  vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);

  blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
  vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
  vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
  vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
  vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);

  bind(L_128_done);
  // compute crc of a 128-bit value
  movdqu(xmm10, Address(table, 3 * 16));
  movdqu(xmm0, xmm7);

  // 64b fold
  vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
  vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
  vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);

  // 32b fold
  movdqu(xmm0, xmm7);
  vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
  vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
  vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
  jmp(L_barrett);

  bind(L_less_than_256);
  kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);

  //barrett reduction
  bind(L_barrett);
  vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
  movdqu(xmm1, xmm7);
  movdqu(xmm2, xmm7);
  movdqu(xmm10, Address(table, 4 * 16));

  pclmulqdq(xmm7, xmm10, 0x0);
  pxor(xmm7, xmm2);
  vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
  movdqu(xmm2, xmm7);
  pclmulqdq(xmm7, xmm10, 0x10);
  pxor(xmm7, xmm2);
  pxor(xmm7, xmm1);
  pextrd(crc, xmm7, 2);   // extract the final 32-bit CRC

  bind(L_cleanup);
  addptr(rsp, 16 * 2 + 8);
  pop(r12);
}
7661 
7662 // S. Gueron / Information Processing Letters 112 (2012) 184
7663 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
7664 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
7665 // Output: the 64-bit carry-less product of B * CONST
void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
                                     Register tmp1, Register tmp2, Register tmp3) {
  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
  if (n > 0) {
    addq(tmp3, n * 256 * 8);   // select sub-table n: 256 entries of 8 bytes each
  }
  //    Q1 = TABLEExt[n][B & 0xFF];
  movl(tmp1, in);
  andl(tmp1, 0x000000FF);
  shll(tmp1, 3);               // byte value -> 8-byte entry offset
  addq(tmp1, tmp3);
  movq(tmp1, Address(tmp1, 0));

  //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
  movl(tmp2, in);
  shrl(tmp2, 8);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addq(tmp2, tmp3);
  movq(tmp2, Address(tmp2, 0));

  shlq(tmp2, 8);               // Q2 << 8
  xorq(tmp1, tmp2);

  //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
  movl(tmp2, in);
  shrl(tmp2, 16);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addq(tmp2, tmp3);
  movq(tmp2, Address(tmp2, 0));

  shlq(tmp2, 16);              // Q3 << 16
  xorq(tmp1, tmp2);

  //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
  shrl(in, 24);
  andl(in, 0x000000FF);
  shll(in, 3);
  addq(in, tmp3);
  movq(in, Address(in, 0));

  shlq(in, 24);                // Q4 << 24
  xorq(in, tmp1);
  //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}
7712 
// Carry-less multiply of in_out by the given 32-bit constant (or, when
// PCLMULQDQ is unavailable, by the table indexed by
// const_or_pre_comp_const_index via Algorithm 4). On LP64 the 64-bit
// product is returned in the GPR in_out.
void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
                                      Register in_out,
                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
                                      XMMRegister w_xtmp2,
                                      Register tmp1,
                                      Register n_tmp2, Register n_tmp3) {
  if (is_pclmulqdq_supported) {
    movdl(w_xtmp1, in_out); // modified blindly

    movl(tmp1, const_or_pre_comp_const_index);
    movdl(w_xtmp2, tmp1);
    pclmulqdq(w_xtmp1, w_xtmp2, 0);   // low-quadword carry-less multiply

    movdq(in_out, w_xtmp1);
  } else {
    // Table-driven fallback; here the index selects the precomputed table.
    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
  }
}
7731 
7732 // Recombination Alternative 2: No bit-reflections
7733 // T1 = (CRC_A * U1) << 1
7734 // T2 = (CRC_B * U2) << 1
7735 // C1 = T1 >> 32
7736 // C2 = T2 >> 32
7737 // T1 = T1 & 0xFFFFFFFF
7738 // T2 = T2 & 0xFFFFFFFF
7739 // T1 = CRC32(0, T1)
7740 // T2 = CRC32(0, T2)
7741 // C1 = C1 ^ T1
7742 // C2 = C2 ^ T2
7743 // CRC = C1 ^ C2 ^ CRC_C
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                     Register tmp1, Register tmp2,
                                     Register n_tmp3) {
  // T1 = CRC_A * U1, T2 = CRC_B * U2 (carry-less products).
  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
  // C1 = (T1 << 1) >> 32, then fold the low half through the CRC32 instruction.
  shlq(in_out, 1);
  movl(tmp1, in_out);     // tmp1 = T1 & 0xFFFFFFFF
  shrq(in_out, 32);       // in_out = C1
  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);   // tmp2 = CRC32(0, T1)
  xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
  // Same recombination for the second stream.
  shlq(in1, 1);
  movl(tmp1, in1);
  shrq(in1, 32);
  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);
  xorl(in1, tmp2);
  // CRC = C1 ^ C2 ^ CRC_C
  xorl(in_out, in1);
  xorl(in_out, in2);
}
7765 
7766 // Set N to predefined value
// Subtract from a length of a buffer
7768 // execute in a loop:
7769 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
7770 // for i = 1 to N do
7771 //  CRC_A = CRC32(CRC_A, A[i])
7772 //  CRC_B = CRC32(CRC_B, B[i])
7773 //  CRC_C = CRC32(CRC_C, C[i])
7774 // end for
7775 // Recombine
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
                                       Register in_out1, Register in_out2, Register in_out3,
                                       Register tmp1, Register tmp2, Register tmp3,
                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                       Register tmp4, Register tmp5,
                                       Register n_tmp6) {
  // in_out1 = remaining length, in_out2 = buffer pointer, in_out3 = running CRC.
  Label L_processPartitions;
  Label L_processPartition;
  Label L_exit;

  bind(L_processPartitions);
  cmpl(in_out1, 3 * size);              // need a full A|B|C partition of 3*size bytes
  jcc(Assembler::less, L_exit);
    xorl(tmp1, tmp1);                   // CRC_B = 0
    xorl(tmp2, tmp2);                   // CRC_C = 0
    movq(tmp3, in_out2);
    addq(tmp3, size);                   // tmp3 = end of partition A

    bind(L_processPartition);
      // Three independent CRC32 streams, 8 bytes each per iteration, to
      // keep the pipelined crc32 instruction busy.
      crc32(in_out3, Address(in_out2, 0), 8);
      crc32(tmp1, Address(in_out2, size), 8);
      crc32(tmp2, Address(in_out2, size * 2), 8);
      addq(in_out2, 8);
      cmpq(in_out2, tmp3);
      jcc(Assembler::less, L_processPartition);
    // Recombine the three partial CRCs into one.
    crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
            w_xtmp1, w_xtmp2, w_xtmp3,
            tmp4, tmp5,
            n_tmp6);
    addq(in_out2, 2 * size);            // skip over partitions B and C
    subl(in_out1, 3 * size);
    jmp(L_processPartitions);

  bind(L_exit);
}
7811 #else
// 32-bit variant of Algorithm 4: GPRs cannot hold the 64-bit carry-less
// product, so the Q accumulators are kept in XMM registers instead
// (result ends up in xtmp1).
void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
                                     Register tmp1, Register tmp2, Register tmp3,
                                     XMMRegister xtmp1, XMMRegister xtmp2) {
  // tmp3 = &TABLEExt[n]; each table holds 256 eight-byte entries.
  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
  if (n > 0) {
    addl(tmp3, n * 256 * 8);
  }
  //    Q1 = TABLEExt[n][B & 0xFF];
  movl(tmp1, in_out);
  andl(tmp1, 0x000000FF);
  shll(tmp1, 3);                  // byte value -> 8-byte table offset
  addl(tmp1, tmp3);
  movq(xtmp1, Address(tmp1, 0));

  //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
  movl(tmp2, in_out);
  shrl(tmp2, 8);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addl(tmp2, tmp3);
  movq(xtmp2, Address(tmp2, 0));

  psllq(xtmp2, 8);                // xtmp1 ^= Q2 << 8
  pxor(xtmp1, xtmp2);

  //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
  movl(tmp2, in_out);
  shrl(tmp2, 16);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addl(tmp2, tmp3);
  movq(xtmp2, Address(tmp2, 0));

  psllq(xtmp2, 16);               // xtmp1 ^= Q3 << 16
  pxor(xtmp1, xtmp2);

  //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
  shrl(in_out, 24);
  andl(in_out, 0x000000FF);
  shll(in_out, 3);
  addl(in_out, tmp3);
  movq(xtmp2, Address(in_out, 0));

  psllq(xtmp2, 24);
  pxor(xtmp1, xtmp2); // Result in CXMM
  //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}
7859 
// 32-bit variant: the 64-bit carry-less product stays in w_xtmp1 (a GPR
// is only 32 bits wide here); the caller reads the result from the XMM
// register rather than from in_out.
void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
                                      Register in_out,
                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
                                      XMMRegister w_xtmp2,
                                      Register tmp1,
                                      Register n_tmp2, Register n_tmp3) {
  if (is_pclmulqdq_supported) {
    movdl(w_xtmp1, in_out);

    movl(tmp1, const_or_pre_comp_const_index);
    movdl(w_xtmp2, tmp1);
    pclmulqdq(w_xtmp1, w_xtmp2, 0);
    // Keep result in XMM since GPR is 32 bit in length
  } else {
    // Table-driven fallback also leaves its result in w_xtmp1.
    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
  }
}
7877 
// 32-bit variant of recombination Alternative 2 (see the LP64 version for
// the algorithm outline): the 64-bit T1/T2 values live in XMM registers,
// with psllq/psrlq standing in for the 64-bit GPR shifts.
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                     Register tmp1, Register tmp2,
                                     Register n_tmp3) {
  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);

  // T1 <<= 1; split into low half (tmp1) and high half C1 (in_out).
  psllq(w_xtmp1, 1);
  movdl(tmp1, w_xtmp1);
  psrlq(w_xtmp1, 32);
  movdl(in_out, w_xtmp1);

  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);   // tmp2 = CRC32(0, T1 & 0xFFFFFFFF)
  xorl(in_out, tmp2);     // C1 ^= T1'

  // Same for the second stream.
  psllq(w_xtmp2, 1);
  movdl(tmp1, w_xtmp2);
  psrlq(w_xtmp2, 32);
  movdl(in1, w_xtmp2);

  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);
  xorl(in1, tmp2);
  // CRC = C1 ^ C2 ^ CRC_C
  xorl(in_out, in1);
  xorl(in_out, in2);
}
7905 
// 32-bit variant of crc32c_proc_chunk: the crc32 instruction is limited to
// 4-byte operands here, so each iteration issues two crc32 ops per stream,
// and scarce registers are spilled around the recombination call.
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
                                       Register in_out1, Register in_out2, Register in_out3,
                                       Register tmp1, Register tmp2, Register tmp3,
                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                       Register tmp4, Register tmp5,
                                       Register n_tmp6) {
  Label L_processPartitions;
  Label L_processPartition;
  Label L_exit;

  bind(L_processPartitions);
  cmpl(in_out1, 3 * size);              // need a full A|B|C partition
  jcc(Assembler::less, L_exit);
    xorl(tmp1, tmp1);                   // CRC_B = 0
    xorl(tmp2, tmp2);                   // CRC_C = 0
    movl(tmp3, in_out2);
    addl(tmp3, size);                   // tmp3 = end of partition A

    bind(L_processPartition);
      // Three interleaved streams; 8 bytes per stream via two 4-byte crc32 ops.
      crc32(in_out3, Address(in_out2, 0), 4);
      crc32(tmp1, Address(in_out2, size), 4);
      crc32(tmp2, Address(in_out2, size*2), 4);
      crc32(in_out3, Address(in_out2, 0+4), 4);
      crc32(tmp1, Address(in_out2, size+4), 4);
      crc32(tmp2, Address(in_out2, size*2+4), 4);
      addl(in_out2, 8);
      cmpl(in_out2, tmp3);
      jcc(Assembler::less, L_processPartition);

        // Spill tmp3/in_out1/in_out2 and reuse them as the recombination
        // scratch registers. NOTE: the assignments below are C++ Register
        // handle aliasing performed at code-generation time — they emit no
        // instructions; the push/pop pairs preserve the runtime values.
        push(tmp3);
        push(in_out1);
        push(in_out2);
        tmp4 = tmp3;
        tmp5 = in_out1;
        n_tmp6 = in_out2;

      crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
            w_xtmp1, w_xtmp2, w_xtmp3,
            tmp4, tmp5,
            n_tmp6);

        pop(in_out2);
        pop(in_out1);
        pop(tmp3);

    addl(in_out2, 2 * size);            // skip over partitions B and C
    subl(in_out1, 3 * size);
    jmp(L_processPartitions);

  bind(L_exit);
}
7957 #endif //LP64
7958 
7959 #ifdef _LP64
7960 // Algorithm 2: Pipelined usage of the CRC32 instruction.
7961 // Input: A buffer I of L bytes.
7962 // Output: the CRC32C value of the buffer.
7963 // Notations:
7964 // Write L = 24N + r, with N = floor (L/24).
7965 // r = L mod 24 (0 <= r < 24).
7966 // Consider I as the concatenation of A|B|C|R, where A, B, C, each,
7967 // N quadwords, and R consists of r bytes.
7968 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
7969 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
7970 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
7971 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
                                          Register tmp1, Register tmp2, Register tmp3,
                                          Register tmp4, Register tmp5, Register tmp6,
                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                          bool is_pclmulqdq_supported) {
  // in_out = CRC accumulator, in1 = buffer pointer, in2 = length.
  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
  Label L_wordByWord;
  Label L_byteByByteProlog;
  Label L_byteByByte;
  Label L_exit;

  // With PCLMULQDQ, the recombination constants U1/U2 for each chunk size
  // are read from the head of the precomputed table; otherwise the values
  // are the table indices used by the Algorithm-4 fallback. Each pair is
  // stored swapped (u1 at [1], u2 at [0], etc.) to match the order the
  // crc32c_proc_chunk calls below consume them in.
  if (is_pclmulqdq_supported ) {
    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);

    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);

    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
    assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
  } else {
    const_or_pre_comp_const_index[0] = 1;
    const_or_pre_comp_const_index[1] = 0;

    const_or_pre_comp_const_index[2] = 3;
    const_or_pre_comp_const_index[3] = 2;

    const_or_pre_comp_const_index[4] = 5;
    const_or_pre_comp_const_index[5] = 4;
   }
  // Process progressively smaller chunk sizes.
  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  // tmp1 = end address of the 8-byte-aligned portion of the remainder.
  movl(tmp1, in2);
  andl(tmp1, 0x00000007);
  negl(tmp1);
  addl(tmp1, in2);
  addq(tmp1, in1);

  cmpq(in1, tmp1);
  jccb(Assembler::greaterEqual, L_byteByByteProlog);
  align(16);
  // Consume the remainder 8 bytes at a time.
  BIND(L_wordByWord);
    crc32(in_out, Address(in1, 0), 8);
    addq(in1, 8);
    cmpq(in1, tmp1);
    jcc(Assembler::less, L_wordByWord);

  BIND(L_byteByByteProlog);
  andl(in2, 0x00000007);        // in2 = number of trailing bytes (< 8)
  movl(tmp2, 1);

  cmpl(tmp2, in2);
  jccb(Assembler::greater, L_exit);
  // Final tail, one byte at a time.
  BIND(L_byteByByte);
    crc32(in_out, Address(in1, 0), 1);
    incq(in1);
    incl(tmp2);
    cmpl(tmp2, in2);
    jcc(Assembler::lessEqual, L_byteByByte);

  BIND(L_exit);
}
8051 #else
// 32-bit variant of the pipelined CRC32C algorithm (see the LP64 version
// above for the algorithm description). Uses 4-byte crc32 operands and
// 32-bit pointer arithmetic throughout.
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
                                          Register tmp1, Register  tmp2, Register tmp3,
                                          Register tmp4, Register  tmp5, Register tmp6,
                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                          bool is_pclmulqdq_supported) {
  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
  Label L_wordByWord;
  Label L_byteByByteProlog;
  Label L_byteByByte;
  Label L_exit;

  // Recombination constants (PCLMULQDQ path) or table indices (fallback);
  // each pair is stored swapped to match crc32c_proc_chunk's argument order.
  if (is_pclmulqdq_supported) {
    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);

    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);

    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
  } else {
    const_or_pre_comp_const_index[0] = 1;
    const_or_pre_comp_const_index[1] = 0;

    const_or_pre_comp_const_index[2] = 3;
    const_or_pre_comp_const_index[3] = 2;

    const_or_pre_comp_const_index[4] = 5;
    const_or_pre_comp_const_index[5] = 4;
  }
  // Process progressively smaller chunk sizes.
  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  // tmp1 = end address of the 4-byte-aligned portion of the remainder.
  movl(tmp1, in2);
  andl(tmp1, 0x00000007);
  negl(tmp1);
  addl(tmp1, in2);
  addl(tmp1, in1);

  // Consume the remainder 4 bytes at a time.
  BIND(L_wordByWord);
  cmpl(in1, tmp1);
  jcc(Assembler::greaterEqual, L_byteByByteProlog);
    crc32(in_out, Address(in1,0), 4);
    addl(in1, 4);
    jmp(L_wordByWord);

  BIND(L_byteByByteProlog);
  andl(in2, 0x00000007);        // in2 = number of trailing bytes (< 8)
  movl(tmp2, 1);

  // Final tail, one byte at a time.
  BIND(L_byteByByte);
  cmpl(tmp2, in2);
  jccb(Assembler::greater, L_exit);
    movb(tmp1, Address(in1, 0));
    crc32(in_out, tmp1, 1);
    incl(in1);
    incl(tmp2);
    jmp(L_byteByByte);

  BIND(L_exit);
}
8128 #endif // LP64
8129 #undef BIND
8130 #undef BLOCK_COMMENT
8131 
8132 // Compress char[] array to byte[].
8133 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
8134 //   @IntrinsicCandidate
8135 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8136 //     for (int i = 0; i < len; i++) {
8137 //       int c = src[srcOff++];
8138 //       if (c >>> 8 != 0) {
8139 //         return 0;
8140 //       }
8141 //       dst[dstOff++] = (byte)c;
8142 //     }
8143 //     return len;
8144 //   }
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
  XMMRegister tmp1Reg, XMMRegister tmp2Reg,
  XMMRegister tmp3Reg, XMMRegister tmp4Reg,
  Register tmp5, Register result, KRegister mask1, KRegister mask2) {
  // Emits code that compresses a UTF-16 char[] into a byte[]; result holds
  // the original length on success, 0 if any char is > 0xFF. Three tiers:
  // AVX-512 masked loop, SSE4.2 vector loop, scalar fallback.
  Label copy_chars_loop, return_length, return_zero, done;

  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result

  // rsi holds start addr of source char[] to be compressed
  // rdi holds start addr of destination byte[]
  // rdx holds length

  assert(len != result, "");

  // save length for return
  push(len);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_loop_tail, below_threshold;

    // alignment
    Label post_alignment;

    // if length of the string is less than 16, handle it in an old fashioned way
    testl(len, -32);
    jcc(Assembler::zero, below_threshold);

    // First check whether a character is compressible ( <= 0xFF).
    // Create mask to test for Unicode chars inside zmm vector
    movl(result, 0x00FF);
    evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);

    testl(len, -64);
    jcc(Assembler::zero, post_alignment);

    // tmp5 = number of chars needed to reach 32-byte alignment of dst.
    movl(tmp5, dst);
    andl(tmp5, (32 - 1));
    negl(tmp5);
    andl(tmp5, (32 - 1));

    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jcc(Assembler::zero, post_alignment);

    // ~(~0 << len), where len is the # of remaining elements to process
    movl(result, 0xFFFFFFFF);
    shlxl(result, result, tmp5);
    notl(result);
    kmovdl(mask2, result);

    // Masked load of the alignment prefix; any char > 0xFF fails compression.
    evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
    evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
    ktestd(mask1, mask2);
    jcc(Assembler::carryClear, return_zero);

    evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);

    // src advances by 2 bytes per char, dst by 1 byte per char.
    addptr(src, tmp5);
    addptr(src, tmp5);
    addptr(dst, tmp5);
    subl(len, tmp5);

    bind(post_alignment);
    // end of alignment

    movl(tmp5, len);
    andl(tmp5, (32 - 1));    // tail count (in chars)
    andl(len, ~(32 - 1));    // vector count (in chars)
    jcc(Assembler::zero, copy_loop_tail);

    // Point src/dst past the vectorized region and count len up to zero.
    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

    bind(copy_32_loop);
    evmovdquw(tmp1Reg, Address(src, len, Address::times_2), /*merge*/ false, Assembler::AVX_512bit);
    evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    kortestdl(mask1, mask1);
    jcc(Assembler::carryClear, return_zero);

    // All elements in current processed chunk are valid candidates for
    // compression. Write truncated byte elements to the memory.
    evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
    addptr(len, 32);
    jcc(Assembler::notZero, copy_32_loop);

    bind(copy_loop_tail);
    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jcc(Assembler::zero, return_length);

    movl(len, tmp5);

    // ~(~0 << len), where len is the # of remaining elements to process
    movl(result, 0xFFFFFFFF);
    shlxl(result, result, len);
    notl(result);

    kmovdl(mask2, result);

    // Masked load/check/store for the final partial vector.
    evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
    evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
    ktestd(mask1, mask2);
    jcc(Assembler::carryClear, return_zero);

    evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
    jmp(return_length);

    bind(below_threshold);
  }

  if (UseSSE42Intrinsics) {
    Label copy_32_loop, copy_16, copy_tail;

    movl(result, len);

    movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors

    // vectored compression
    andl(len, 0xfffffff0);    // vector count (in chars)
    andl(result, 0x0000000f);    // tail count (in chars)
    testl(len, len);
    jcc(Assembler::zero, copy_16);

    // compress 16 chars per iter
    movdl(tmp1Reg, tmp5);
    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
    pxor(tmp4Reg, tmp4Reg);

    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

    bind(copy_32_loop);
    movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
    por(tmp4Reg, tmp2Reg);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
    por(tmp4Reg, tmp3Reg);
    ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
    jcc(Assembler::notZero, return_zero);
    packuswb(tmp2Reg, tmp3Reg);    // only ASCII chars; compress each to 1 byte
    movdqu(Address(dst, len, Address::times_1), tmp2Reg);
    addptr(len, 16);
    jcc(Assembler::notZero, copy_32_loop);

    // compress next vector of 8 chars (if any)
    bind(copy_16);
    movl(len, result);
    andl(len, 0xfffffff8);    // vector count (in chars)
    andl(result, 0x00000007);    // tail count (in chars)
    testl(len, len);
    jccb(Assembler::zero, copy_tail);

    movdl(tmp1Reg, tmp5);
    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
    pxor(tmp3Reg, tmp3Reg);

    movdqu(tmp2Reg, Address(src, 0));
    ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
    jccb(Assembler::notZero, return_zero);
    packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
    movq(Address(dst, 0), tmp2Reg);
    addptr(src, 16);
    addptr(dst, 8);

    bind(copy_tail);
    movl(len, result);
  }
  // compress 1 char per iter
  testl(len, len);
  jccb(Assembler::zero, return_length);
  lea(src, Address(src, len, Address::times_2));
  lea(dst, Address(dst, len, Address::times_1));
  negptr(len);

  bind(copy_chars_loop);
  load_unsigned_short(result, Address(src, len, Address::times_2));
  testl(result, 0xff00);      // check if Unicode char
  jccb(Assembler::notZero, return_zero);
  movb(Address(dst, len, Address::times_1), result);  // ASCII char; compress to 1 byte
  increment(len);
  jcc(Assembler::notZero, copy_chars_loop);

  // if compression succeeded, return length
  bind(return_length);
  pop(result);
  jmpb(done);

  // if compression failed, return 0
  bind(return_zero);
  xorl(result, result);
  addptr(rsp, wordSize);      // discard the saved length

  bind(done);
}
8347 
8348 // Inflate byte[] array to char[].
8349 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
8350 //   @IntrinsicCandidate
8351 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
8352 //     for (int i = 0; i < len; i++) {
8353 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
8354 //     }
8355 //   }
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
  XMMRegister tmp1, Register tmp2, KRegister mask) {
  // Emits code that zero-extends each Latin-1 byte of src into a UTF-16
  // char of dst. Three tiers: AVX-512 masked loop, AVX2/SSE4.2 vector
  // loop, scalar fallback.
  Label copy_chars_loop, done, below_threshold, avx3_threshold;
  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp2

  // rsi holds start addr of source byte[] to be inflated
  // rdi holds start addr of destination char[]
  // rdx holds length
  assert_different_registers(src, dst, len, tmp2);
  movl(tmp2, len);
  if ((UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_tail;
    Register tmp3_aliased = len;    // len is dead once split into loop/tail counts

    // if length of the string is less than 16, handle it in an old fashioned way
    testl(len, -16);
    jcc(Assembler::zero, below_threshold);

    testl(len, -1 * AVX3Threshold);
    jcc(Assembler::zero, avx3_threshold);

    // In order to use only one arithmetic operation for the main loop we use
    // this pre-calculation
    andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
    andl(len, -32);     // vector count
    jccb(Assembler::zero, copy_tail);

    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);


    // inflate 32 chars per iter
    bind(copy_32_loop);
    vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
    evmovdquw(Address(dst, len, Address::times_2), tmp1, /*merge*/ false, Assembler::AVX_512bit);
    addptr(len, 32);
    jcc(Assembler::notZero, copy_32_loop);

    bind(copy_tail);
    // bail out when there is nothing to be done
    testl(tmp2, -1); // we don't destroy the contents of tmp2 here
    jcc(Assembler::zero, done);

    // ~(~0 << length), where length is the # of remaining elements to process
    movl(tmp3_aliased, -1);
    shlxl(tmp3_aliased, tmp3_aliased, tmp2);
    notl(tmp3_aliased);
    kmovdl(mask, tmp3_aliased);
    // Masked inflate of the final partial vector.
    evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
    evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);

    jmp(done);
    bind(avx3_threshold);
  }
  if (UseSSE42Intrinsics) {
    Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;

    // Split len into vector count (len) and tail count (tmp2).
    if (UseAVX > 1) {
      andl(tmp2, (16 - 1));
      andl(len, -16);
      jccb(Assembler::zero, copy_new_tail);
    } else {
      andl(tmp2, 0x00000007);   // tail count (in chars)
      andl(len, 0xfffffff8);    // vector count (in chars)
      jccb(Assembler::zero, copy_tail);
    }

    // vectored inflation
    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);

    if (UseAVX > 1) {
      // inflate 16 chars per iter
      bind(copy_16_loop);
      vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
      vmovdqu(Address(dst, len, Address::times_2), tmp1);
      addptr(len, 16);
      jcc(Assembler::notZero, copy_16_loop);

      bind(below_threshold);
      bind(copy_new_tail);
      // One more 8-char vector step if the tail allows, then scalar tail.
      movl(len, tmp2);
      andl(tmp2, 0x00000007);
      andl(len, 0xFFFFFFF8);
      jccb(Assembler::zero, copy_tail);

      pmovzxbw(tmp1, Address(src, 0));
      movdqu(Address(dst, 0), tmp1);
      addptr(src, 8);
      addptr(dst, 2 * 8);

      jmp(copy_tail, true);     // near jump
    }

    // inflate 8 chars per iter
    bind(copy_8_loop);
    pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
    movdqu(Address(dst, len, Address::times_2), tmp1);
    addptr(len, 8);
    jcc(Assembler::notZero, copy_8_loop);

    bind(copy_tail);
    movl(len, tmp2);

    cmpl(len, 4);
    jccb(Assembler::less, copy_bytes);

    movdl(tmp1, Address(src, 0));  // load 4 byte chars
    pmovzxbw(tmp1, tmp1);
    movq(Address(dst, 0), tmp1);
    subptr(len, 4);
    addptr(src, 4);
    addptr(dst, 8);

    bind(copy_bytes);
  } else {
    bind(below_threshold);
  }

  testl(len, len);
  jccb(Assembler::zero, done);
  lea(src, Address(src, len, Address::times_1));
  lea(dst, Address(dst, len, Address::times_2));
  negptr(len);

  // inflate 1 char per iter
  bind(copy_chars_loop);
  load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
  movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
  increment(len);
  jcc(Assembler::notZero, copy_chars_loop);

  bind(done);
}
8497 
8498 
8499 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
8500   switch(type) {
8501     case T_BYTE:
8502     case T_BOOLEAN:
8503       evmovdqub(dst, kmask, src, false, vector_len);
8504       break;
8505     case T_CHAR:
8506     case T_SHORT:
8507       evmovdquw(dst, kmask, src, false, vector_len);
8508       break;
8509     case T_INT:
8510     case T_FLOAT:
8511       evmovdqul(dst, kmask, src, false, vector_len);
8512       break;
8513     case T_LONG:
8514     case T_DOUBLE:
8515       evmovdquq(dst, kmask, src, false, vector_len);
8516       break;
8517     default:
8518       fatal("Unexpected type argument %s", type2name(type));
8519       break;
8520   }
8521 }
8522 
8523 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
8524   switch(type) {
8525     case T_BYTE:
8526     case T_BOOLEAN:
8527       evmovdqub(dst, kmask, src, true, vector_len);
8528       break;
8529     case T_CHAR:
8530     case T_SHORT:
8531       evmovdquw(dst, kmask, src, true, vector_len);
8532       break;
8533     case T_INT:
8534     case T_FLOAT:
8535       evmovdqul(dst, kmask, src, true, vector_len);
8536       break;
8537     case T_LONG:
8538     case T_DOUBLE:
8539       evmovdquq(dst, kmask, src, true, vector_len);
8540       break;
8541     default:
8542       fatal("Unexpected type argument %s", type2name(type));
8543       break;
8544   }
8545 }
8546 
8547 #if COMPILER2_OR_JVMCI
8548 
8549 
8550 // Set memory operation for length "less than" 64 bytes.
8551 void MacroAssembler::fill64_masked_avx(uint shift, Register dst, int disp,
8552                                        XMMRegister xmm, KRegister mask, Register length,
8553                                        Register temp, bool use64byteVector) {
8554   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8555   assert(shift != 0, "shift value should be 1 (short),2(int) or 3(long)");
8556   BasicType type[] = { T_BYTE, T_SHORT,  T_INT,   T_LONG};
8557   if (!use64byteVector) {
8558     fill32_avx(dst, disp, xmm);
8559     subptr(length, 32 >> shift);
8560     fill32_masked_avx(shift, dst, disp + 32, xmm, mask, length, temp);
8561   } else {
8562     assert(MaxVectorSize == 64, "vector length != 64");
8563     movl(temp, 1);
8564     shlxl(temp, temp, length);
8565     subptr(temp, 1);
8566     kmovwl(mask, temp);
8567     evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_512bit);
8568   }
8569 }
8570 
8571 
// Masked 32-byte (256-bit) fill: stores exactly 'length' elements of
// 2^shift bytes each from 'xmm' to [dst + disp], using a freshly
// computed opmask so no bytes beyond the requested length are written.
void MacroAssembler::fill32_masked_avx(uint shift, Register dst, int disp,
                                       XMMRegister xmm, KRegister mask, Register length,
                                       Register temp) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  assert(shift != 0, "shift value should be 1 (short), 2(int) or 3(long)");
  BasicType type[] = { T_BYTE, T_SHORT,  T_INT,   T_LONG};
  // Build mask = (1 << length) - 1: the low 'length' lanes enabled.
  movl(temp, 1);
  shlxl(temp, temp, length);
  subptr(temp, 1);
  kmovwl(mask, temp);
  evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_256bit);
}
8584 
// Unmasked 32-byte store of 'xmm' to 'dst'.
void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  vmovdqu(dst, xmm);
}
8589 
// Convenience wrapper: unmasked 32-byte store of 'xmm' to [dst + disp].
void MacroAssembler::fill32_avx(Register dst, int disp, XMMRegister xmm) {
  fill32(Address(dst, disp), xmm);
}
8593 
8594 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
8595   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8596   if (!use64byteVector) {
8597     fill32(dst, xmm);
8598     fill32(dst.plus_disp(32), xmm);
8599   } else {
8600     evmovdquq(dst, xmm, Assembler::AVX_512bit);
8601   }
8602 }
8603 
// Convenience wrapper: 64-byte fill from 'xmm' to [dst + disp].
void MacroAssembler::fill64_avx(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
  fill64(Address(dst, disp), xmm, use64byteVector);
}
8607 
8608 #endif //COMPILER2_OR_JVMCI
8609 
8610 
8611 #ifdef _LP64
// Convert the float in 'src' to a 32-bit int in 'dst' with Java (JLS) semantics.
void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
  Label done;
  // Truncating conversion; on overflow/NaN x86 produces the "integer
  // indefinite" value 0x80000000 rather than the JLS-mandated result.
  cvttss2sil(dst, src);
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
  cmpl(dst, 0x80000000); // float_sign_flip
  jccb(Assembler::notEqual, done);
  // Pass the original float to the fixup stub on the stack; the stub is
  // presumed to leave the corrected result at [rsp] (TODO confirm against
  // f2i_fixup), so the pop both fetches it and rebalances the stack.
  subptr(rsp, 8);
  movflt(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
  pop(dst);
  bind(done);
}
8624 
// Convert the double in 'src' to a 32-bit int in 'dst' with Java (JLS) semantics.
void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
  Label done;
  // Truncating conversion; on overflow/NaN x86 produces the "integer
  // indefinite" value 0x80000000 rather than the JLS-mandated result.
  cvttsd2sil(dst, src);
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
  cmpl(dst, 0x80000000); // float_sign_flip
  jccb(Assembler::notEqual, done);
  // Pass the original double to the fixup stub on the stack; the stub is
  // presumed to leave the corrected result at [rsp] (TODO confirm against
  // d2i_fixup), so the pop both fetches it and rebalances the stack.
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
  pop(dst);
  bind(done);
}
8637 
// Convert the float in 'src' to a 64-bit long in 'dst' with Java (JLS) semantics.
void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
  Label done;
  // Truncating conversion; on overflow/NaN the hardware produces the
  // 64-bit "integer indefinite" value 0x8000000000000000 (which is what
  // double_sign_flip holds) -> fixup in stub, as in convert_f2i.
  cvttss2siq(dst, src);
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  // Pass the original float to the fixup stub on the stack; the stub is
  // presumed to leave the corrected result at [rsp] (TODO confirm against
  // f2l_fixup), so the pop both fetches it and rebalances the stack.
  subptr(rsp, 8);
  movflt(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
  pop(dst);
  bind(done);
}
8649 
// Convert the double in 'src' to a 64-bit long in 'dst' with Java (JLS) semantics.
void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
  Label done;
  // Truncating conversion; on overflow/NaN the hardware produces the
  // 64-bit "integer indefinite" value 0x8000000000000000 (which is what
  // double_sign_flip holds) -> fixup in stub, as in convert_d2i.
  cvttsd2siq(dst, src);
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  // Pass the original double to the fixup stub on the stack; the stub is
  // presumed to leave the corrected result at [rsp] (TODO confirm against
  // d2l_fixup), so the pop both fetches it and rebalances the stack.
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
  pop(dst);
  bind(done);
}
8661 
8662 void MacroAssembler::cache_wb(Address line)
8663 {
8664   // 64 bit cpus always support clflush
8665   assert(VM_Version::supports_clflush(), "clflush should be available");
8666   bool optimized = VM_Version::supports_clflushopt();
8667   bool no_evict = VM_Version::supports_clwb();
8668 
8669   // prefer clwb (writeback without evict) otherwise
8670   // prefer clflushopt (potentially parallel writeback with evict)
8671   // otherwise fallback on clflush (serial writeback with evict)
8672 
8673   if (optimized) {
8674     if (no_evict) {
8675       clwb(line);
8676     } else {
8677       clflushopt(line);
8678     }
8679   } else {
8680     // no need for fence when using CLFLUSH
8681     clflush(line);
8682   }
8683 }
8684 
8685 void MacroAssembler::cache_wbsync(bool is_pre)
8686 {
8687   assert(VM_Version::supports_clflush(), "clflush should be available");
8688   bool optimized = VM_Version::supports_clflushopt();
8689   bool no_evict = VM_Version::supports_clwb();
8690 
8691   // pick the correct implementation
8692 
8693   if (!is_pre && (optimized || no_evict)) {
8694     // need an sfence for post flush when using clflushopt or clwb
8695     // otherwise no no need for any synchroniaztion
8696 
8697     sfence();
8698   }
8699 }
8700 
8701 #endif // _LP64
8702 
8703 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
8704   switch (cond) {
8705     // Note some conditions are synonyms for others
8706     case Assembler::zero:         return Assembler::notZero;
8707     case Assembler::notZero:      return Assembler::zero;
8708     case Assembler::less:         return Assembler::greaterEqual;
8709     case Assembler::lessEqual:    return Assembler::greater;
8710     case Assembler::greater:      return Assembler::lessEqual;
8711     case Assembler::greaterEqual: return Assembler::less;
8712     case Assembler::below:        return Assembler::aboveEqual;
8713     case Assembler::belowEqual:   return Assembler::above;
8714     case Assembler::above:        return Assembler::belowEqual;
8715     case Assembler::aboveEqual:   return Assembler::below;
8716     case Assembler::overflow:     return Assembler::noOverflow;
8717     case Assembler::noOverflow:   return Assembler::overflow;
8718     case Assembler::negative:     return Assembler::positive;
8719     case Assembler::positive:     return Assembler::negative;
8720     case Assembler::parity:       return Assembler::noParity;
8721     case Assembler::noParity:     return Assembler::parity;
8722   }
8723   ShouldNotReachHere(); return Assembler::overflow;
8724 }
8725 
8726 SkipIfEqual::SkipIfEqual(
8727     MacroAssembler* masm, const bool* flag_addr, bool value) {
8728   _masm = masm;
8729   _masm->cmp8(ExternalAddress((address)flag_addr), value);
8730   _masm->jcc(Assembler::equal, _label);
8731 }
8732 
// Bind the skip target: code emitted since construction is jumped over
// when the flag compared equal in the constructor.
SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}
8736 
8737 // 32-bit Windows has its own fast-path implementation
8738 // of get_thread
8739 #if !defined(WIN32) || defined(_LP64)
8740 
8741 // This is simply a call to Thread::current()
// Load the current Thread* into 'thread' by calling Thread::current(),
// preserving all caller-saved integer registers around the call.
void MacroAssembler::get_thread(Register thread) {
  // The call returns its result in rax; save the caller's rax unless rax
  // is the requested destination.
  if (thread != rax) {
    push(rax);
  }
  // Save the remaining caller-saved registers (rdi/rsi and r8-r11 are
  // 64-bit only).
  LP64_ONLY(push(rdi);)
  LP64_ONLY(push(rsi);)
  push(rdx);
  push(rcx);
#ifdef _LP64
  push(r8);
  push(r9);
  push(r10);
  push(r11);
#endif

  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);

  // Restore in reverse order of the saves above.
#ifdef _LP64
  pop(r11);
  pop(r10);
  pop(r9);
  pop(r8);
#endif
  pop(rcx);
  pop(rdx);
  LP64_ONLY(pop(rsi);)
  LP64_ONLY(pop(rdi);)
  if (thread != rax) {
    // Move the result to its destination, then restore the saved rax.
    mov(thread, rax);
    pop(rax);
  }
}
8774 
8775 #endif // !WIN32 || _LP64
8776 
8777 // Implements fast-locking.
8778 // Branches to slow upon failure to lock the object, with ZF cleared.
8779 // Falls through upon success with unspecified ZF.
8780 //
8781 // obj: the object to be locked
8782 // hdr: the (pre-loaded) header of the object, must be rax
8783 // thread: the thread which attempts to lock obj
8784 // tmp: a temporary register
void MacroAssembler::fast_lock_impl(Register obj, Register hdr, Register thread, Register tmp, Label& slow) {
  assert(hdr == rax, "header must be in rax for cmpxchg");
  assert_different_registers(obj, hdr, thread, tmp);

  // First we need to check if the lock-stack has room for pushing the object reference.
  // Note: we subtract 1 from the end-offset so that we can do a 'greater' comparison, instead
  // of 'greaterEqual' below, which readily clears the ZF. This makes C2 code a little simpler and
  // avoids one branch.
  cmpl(Address(thread, JavaThread::lock_stack_top_offset()), LockStack::end_offset() - 1);
  jcc(Assembler::greater, slow);

  // Now we attempt to take the fast-lock.
  // Clear lock_mask bits (locked state).
  andptr(hdr, ~(int32_t)markWord::lock_mask_in_place);
  movptr(tmp, hdr); // tmp := desired new (locked) mark word
  // Set unlocked_value bit.
  orptr(hdr, markWord::unlocked_value); // hdr (rax) := expected current (unlocked) mark word
  lock();
  // CAS: compares rax against the mark word and, on match, installs tmp;
  // ZF reflects success, so a failed exchange takes the slow path.
  cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
  jcc(Assembler::notEqual, slow);

  // If successful, push object to lock-stack: store obj at the current
  // top offset, then bump the top by one oop slot.
  movl(tmp, Address(thread, JavaThread::lock_stack_top_offset()));
  movptr(Address(thread, tmp), obj);
  incrementl(tmp, oopSize);
  movl(Address(thread, JavaThread::lock_stack_top_offset()), tmp);
}
8812 
8813 // Implements fast-unlocking.
8814 // Branches to slow upon failure, with ZF cleared.
8815 // Falls through upon success, with unspecified ZF.
8816 //
8817 // obj: the object to be unlocked
8818 // hdr: the (pre-loaded) header of the object, must be rax
8819 // tmp: a temporary register
void MacroAssembler::fast_unlock_impl(Register obj, Register hdr, Register tmp, Label& slow) {
  assert(hdr == rax, "header must be in rax for cmpxchg");
  assert_different_registers(obj, hdr, tmp);

  // Mark-word must be lock_mask now, try to swing it back to unlocked_value.
  movptr(tmp, hdr); // tmp := header; becomes the new (unlocked) value below
  orptr(tmp, markWord::unlocked_value);
  lock();
  // CAS: rax (hdr) holds the expected locked mark word; on match tmp (the
  // unlocked mark word) is installed. Failure branches to the slow path
  // with ZF cleared, per this function's contract.
  cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
  jcc(Assembler::notEqual, slow);
  // Pop the lock object from the lock-stack.
#ifdef _LP64
  const Register thread = r15_thread;
#else
  // On 32-bit there is no dedicated thread register; rax is dead after the
  // successful CAS, so reuse it for the current thread.
  const Register thread = rax;
  get_thread(thread);
#endif
  subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
#ifdef ASSERT
  // Debug builds: zap the vacated lock-stack slot so stale entries are
  // easier to spot.
  movl(tmp, Address(thread, JavaThread::lock_stack_top_offset()));
  movptr(Address(thread, tmp), 0);
#endif
}