1 /*
   2  * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/compiler_globals.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/collectedHeap.inline.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/bytecodeHistogram.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/resourceArea.hpp"
  38 #include "memory/universe.hpp"
  39 #include "oops/accessDecorators.hpp"
  40 #include "oops/compressedOops.inline.hpp"
  41 #include "oops/klass.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/biasedLocking.hpp"
  44 #include "runtime/flags/flagSetting.hpp"
  45 #include "runtime/interfaceSupport.inline.hpp"
  46 #include "runtime/jniHandles.hpp"
  47 #include "runtime/objectMonitor.hpp"
  48 #include "runtime/os.hpp"
  49 #include "runtime/safepoint.hpp"
  50 #include "runtime/safepointMechanism.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/stubRoutines.hpp"
  53 #include "runtime/thread.hpp"
  54 #include "utilities/macros.hpp"
  55 #include "crc32c.h"
  56 
  57 #ifdef PRODUCT
  58 #define BLOCK_COMMENT(str) /* nothing */
  59 #define STOP(error) stop(error)
  60 #else
  61 #define BLOCK_COMMENT(str) block_comment(str)
  62 #define STOP(error) block_comment(error); stop(error)
  63 #endif
  64 
  65 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  66 
  67 #ifdef ASSERT
  68 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  69 #endif
  70 
  71 static Assembler::Condition reverse[] = {
  72     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  73     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  74     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  75     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  76     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  77     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  78     Assembler::above          /* belowEqual    = 0x6 */ ,
  79     Assembler::belowEqual     /* above         = 0x7 */ ,
  80     Assembler::positive       /* negative      = 0x8 */ ,
  81     Assembler::negative       /* positive      = 0x9 */ ,
  82     Assembler::noParity       /* parity        = 0xa */ ,
  83     Assembler::parity         /* noParity      = 0xb */ ,
  84     Assembler::greaterEqual   /* less          = 0xc */ ,
  85     Assembler::less           /* greaterEqual  = 0xd */ ,
  86     Assembler::greater        /* lessEqual     = 0xe */ ,
  87     Assembler::lessEqual      /* greater       = 0xf, */
  88 
  89 };
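     // Editor's note: reverse[cc] yields the negation of condition cc, so a branch's
     // sense can be flipped by indexing this table. Illustrative sketch (not original
     // code; the helper name is hypothetical):
     //
     //   Assembler::Condition negate(Assembler::Condition cc) {
     //     return reverse[cc];   // e.g. reverse[Assembler::zero] == Assembler::notZero
     //   }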
  90 
  91 
  92 // Implementation of MacroAssembler
  93 
  94 // First all the versions that have distinct versions depending on 32/64 bit
  95 // Unless the difference is trivial (1 line or so).
  96 
  97 #ifndef _LP64
  98 
  99 // 32bit versions
 100 
 101 Address MacroAssembler::as_Address(AddressLiteral adr) {
 102   return Address(adr.target(), adr.rspec());
 103 }
 104 
 105 Address MacroAssembler::as_Address(ArrayAddress adr) {
 106   return Address::make_array(adr);
 107 }
 108 
 109 void MacroAssembler::call_VM_leaf_base(address entry_point,
 110                                        int number_of_arguments) {
 111   call(RuntimeAddress(entry_point));
 112   increment(rsp, number_of_arguments * wordSize);
 113 }
 114 
 115 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 116   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 117 }
 118 
 119 
 120 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 121   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 122 }
 123 
 124 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 125   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 126 }
 127 
 128 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 129   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 130 }
 131 
 132 void MacroAssembler::extend_sign(Register hi, Register lo) {
 133   // According to Intel Doc. AP-526, "Integer Divide", p.18.
 134   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 135     cdql();
 136   } else {
 137     movl(hi, lo);
 138     sarl(hi, 31);
 139   }
 140 }
 141 
 142 void MacroAssembler::jC2(Register tmp, Label& L) {
 143   // set parity bit if FPU flag C2 is set (via rax)
 144   save_rax(tmp);
 145   fwait(); fnstsw_ax();
 146   sahf();
 147   restore_rax(tmp);
 148   // branch
 149   jcc(Assembler::parity, L);
 150 }
 151 
 152 void MacroAssembler::jnC2(Register tmp, Label& L) {
 153   // set parity bit if FPU flag C2 is set (via rax)
 154   save_rax(tmp);
 155   fwait(); fnstsw_ax();
 156   sahf();
 157   restore_rax(tmp);
 158   // branch
 159   jcc(Assembler::noParity, L);
 160 }
 161 
 162 // 32bit can do a case table jump in one instruction but we no longer allow the base
 163 // to be installed in the Address class
 164 void MacroAssembler::jump(ArrayAddress entry) {
 165   jmp(as_Address(entry));
 166 }
 167 
 168 // Note: y_lo will be destroyed
 169 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 170   // Long compare for Java (semantics as described in JVM spec.)
 171   Label high, low, done;
 172 
 173   cmpl(x_hi, y_hi);
 174   jcc(Assembler::less, low);
 175   jcc(Assembler::greater, high);
 176   // x_hi is the return register
 177   xorl(x_hi, x_hi);
 178   cmpl(x_lo, y_lo);
 179   jcc(Assembler::below, low);
 180   jcc(Assembler::equal, done);
 181 
 182   bind(high);
 183   xorl(x_hi, x_hi);
 184   increment(x_hi);
 185   jmp(done);
 186 
 187   bind(low);
 188   xorl(x_hi, x_hi);
 189   decrementl(x_hi);
 190 
 191   bind(done);
 192 }
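     // Editor's worked example: lcmp2int leaves the three-way result of Java's lcmp
     // in x_hi (-1, 0 or +1). For x = 0x0000000100000000 and y = 0x00000000FFFFFFFF
     // the high words differ (1 > 0), so the "high" path yields +1; only when the high
     // words are equal are the low words compared, and then as unsigned values.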
 193 
 194 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 195     mov_literal32(dst, (int32_t)src.target(), src.rspec());
 196 }
 197 
 198 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 199   // leal(dst, as_Address(adr));
 200   // see note in movl as to why we must use a move
 201   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 202 }
 203 
 204 void MacroAssembler::leave() {
 205   mov(rsp, rbp);
 206   pop(rbp);
 207 }
 208 
 209 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 210   // Multiplication of two Java long values stored on the stack
 211   // as illustrated below. Result is in rdx:rax.
 212   //
 213   // rsp ---> [  ??  ] \               \
 214   //            ....    | y_rsp_offset  |
 215   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 216   //          [ y_hi ]                  | (in bytes)
 217   //            ....                    |
 218   //          [ x_lo ]                 /
 219   //          [ x_hi ]
 220   //            ....
 221   //
 222   // Basic idea: lo(result) = lo(x_lo * y_lo)
 223   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
 224   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 225   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 226   Label quick;
 227   // load x_hi, y_hi and check if quick
 228   // multiplication is possible
 229   movl(rbx, x_hi);
 230   movl(rcx, y_hi);
 231   movl(rax, rbx);
 232   orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
 233   jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
 234   // do full multiplication
 235   // 1st step
 236   mull(y_lo);                                    // x_hi * y_lo
 237   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
 238   // 2nd step
 239   movl(rax, x_lo);
 240   mull(rcx);                                     // x_lo * y_hi
 241   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
 242   // 3rd step
 243   bind(quick);                                   // note: rbx, = 0 if quick multiply!
 244   movl(rax, x_lo);
 245   mull(y_lo);                                    // x_lo * y_lo
 246   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 247 }
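     // Editor's note on the arithmetic above: writing x = x_hi*2^32 + x_lo and
     // y = y_hi*2^32 + y_lo, the low 64 bits of the product are
     //
     //   x*y mod 2^64 = x_lo*y_lo + ((x_hi*y_lo + x_lo*y_hi) << 32)
     //
     // The x_hi*y_hi term only affects bits >= 64 and is dropped, which is why the
     // three 32-bit multiplies in the steps above suffice; when both high words are
     // zero the "quick" path performs just the single x_lo*y_lo multiply.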
 248 
 249 void MacroAssembler::lneg(Register hi, Register lo) {
 250   negl(lo);
 251   adcl(hi, 0);
 252   negl(hi);
 253 }
 254 
 255 void MacroAssembler::lshl(Register hi, Register lo) {
 256   // Java shift left long support (semantics as described in JVM spec., p.305)
 257   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 258   // shift value is in rcx !
 259   assert(hi != rcx, "must not use rcx");
 260   assert(lo != rcx, "must not use rcx");
 261   const Register s = rcx;                        // shift count
 262   const int      n = BitsPerWord;
 263   Label L;
 264   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 265   cmpl(s, n);                                    // if (s < n)
 266   jcc(Assembler::less, L);                       // else (s >= n)
 267   movl(hi, lo);                                  // x := x << n
 268   xorl(lo, lo);
 269   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 270   bind(L);                                       // s (mod n) < n
 271   shldl(hi, lo);                                 // x := x << s
 272   shll(lo);
 273 }
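     // Editor's example: for a shift count s = 40 (>= 32) the identity in the comment
     // gives x << 40 == (x << 32) << 8, i.e. hi := lo, lo := 0, followed by a shift of
     // 40 mod 32 = 8 via shld/shl -- exactly the two halves of the code above.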
 274 
 275 
 276 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 277   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 278   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
 279   assert(hi != rcx, "must not use rcx");
 280   assert(lo != rcx, "must not use rcx");
 281   const Register s = rcx;                        // shift count
 282   const int      n = BitsPerWord;
 283   Label L;
 284   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 285   cmpl(s, n);                                    // if (s < n)
 286   jcc(Assembler::less, L);                       // else (s >= n)
 287   movl(lo, hi);                                  // x := x >> n
 288   if (sign_extension) sarl(hi, 31);
 289   else                xorl(hi, hi);
 290   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 291   bind(L);                                       // s (mod n) < n
 292   shrdl(lo, hi);                                 // x := x >> s
 293   if (sign_extension) sarl(hi);
 294   else                shrl(hi);
 295 }
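     // Editor's example: analogously, x >> 40 == (x >> 32) >> 8, i.e. lo := hi,
     // hi := sign bits (sar) or zero (shr), then shrd/sar (or shr) by 40 mod 32 = 8.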
 296 
 297 void MacroAssembler::movoop(Register dst, jobject obj) {
 298   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 299 }
 300 
 301 void MacroAssembler::movoop(Address dst, jobject obj) {
 302   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 303 }
 304 
 305 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 306   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 307 }
 308 
 309 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 310   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 311 }
 312 
 313 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 314   // scratch register is not used,
 315   // it is defined to match parameters of 64-bit version of this method.
 316   if (src.is_lval()) {
 317     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 318   } else {
 319     movl(dst, as_Address(src));
 320   }
 321 }
 322 
 323 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 324   movl(as_Address(dst), src);
 325 }
 326 
 327 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 328   movl(dst, as_Address(src));
 329 }
 330 
 331 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 332 void MacroAssembler::movptr(Address dst, intptr_t src) {
 333   movl(dst, src);
 334 }
 335 
 336 
 337 void MacroAssembler::pop_callee_saved_registers() {
 338   pop(rcx);
 339   pop(rdx);
 340   pop(rdi);
 341   pop(rsi);
 342 }
 343 
 344 void MacroAssembler::push_callee_saved_registers() {
 345   push(rsi);
 346   push(rdi);
 347   push(rdx);
 348   push(rcx);
 349 }
 350 
 351 void MacroAssembler::pushoop(jobject obj) {
 352   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 353 }
 354 
 355 void MacroAssembler::pushklass(Metadata* obj) {
 356   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 357 }
 358 
 359 void MacroAssembler::pushptr(AddressLiteral src) {
 360   if (src.is_lval()) {
 361     push_literal32((int32_t)src.target(), src.rspec());
 362   } else {
 363     pushl(as_Address(src));
 364   }
 365 }
 366 
 367 static void pass_arg0(MacroAssembler* masm, Register arg) {
 368   masm->push(arg);
 369 }
 370 
 371 static void pass_arg1(MacroAssembler* masm, Register arg) {
 372   masm->push(arg);
 373 }
 374 
 375 static void pass_arg2(MacroAssembler* masm, Register arg) {
 376   masm->push(arg);
 377 }
 378 
 379 static void pass_arg3(MacroAssembler* masm, Register arg) {
 380   masm->push(arg);
 381 }
 382 
 383 #ifndef PRODUCT
 384 extern "C" void findpc(intptr_t x);
 385 #endif
 386 
 387 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 388   // In order to get locks to work, we need to fake an in_VM state
 389   JavaThread* thread = JavaThread::current();
 390   JavaThreadState saved_state = thread->thread_state();
 391   thread->set_thread_state(_thread_in_vm);
 392   if (ShowMessageBoxOnError) {
 393     JavaThread* thread = JavaThread::current();
 394     JavaThreadState saved_state = thread->thread_state();
 395     thread->set_thread_state(_thread_in_vm);
 396     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 397       ttyLocker ttyl;
 398       BytecodeCounter::print();
 399     }
 400     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 401     // This is the value of eip which points to where verify_oop will return.
 402     if (os::message_box(msg, "Execution stopped, print registers?")) {
 403       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 404       BREAKPOINT;
 405     }
 406   }
 407   fatal("DEBUG MESSAGE: %s", msg);
 408 }
 409 
 410 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 411   ttyLocker ttyl;
 412   FlagSetting fs(Debugging, true);
 413   tty->print_cr("eip = 0x%08x", eip);
 414 #ifndef PRODUCT
 415   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 416     tty->cr();
 417     findpc(eip);
 418     tty->cr();
 419   }
 420 #endif
 421 #define PRINT_REG(rax) \
 422   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 423   PRINT_REG(rax);
 424   PRINT_REG(rbx);
 425   PRINT_REG(rcx);
 426   PRINT_REG(rdx);
 427   PRINT_REG(rdi);
 428   PRINT_REG(rsi);
 429   PRINT_REG(rbp);
 430   PRINT_REG(rsp);
 431 #undef PRINT_REG
 432   // Print some words near the top of the stack.
 433   int* dump_sp = (int*) rsp;
 434   for (int col1 = 0; col1 < 8; col1++) {
 435     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 436     os::print_location(tty, *dump_sp++);
 437   }
 438   for (int row = 0; row < 16; row++) {
 439     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 440     for (int col = 0; col < 8; col++) {
 441       tty->print(" 0x%08x", *dump_sp++);
 442     }
 443     tty->cr();
 444   }
 445   // Print some instructions around pc:
 446   Disassembler::decode((address)eip-64, (address)eip);
 447   tty->print_cr("--------");
 448   Disassembler::decode((address)eip, (address)eip+32);
 449 }
 450 
 451 void MacroAssembler::stop(const char* msg) {
 452   ExternalAddress message((address)msg);
 453   // push address of message
 454   pushptr(message.addr());
 455   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 456   pusha();                                            // push registers
 457   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 458   hlt();
 459 }
 460 
 461 void MacroAssembler::warn(const char* msg) {
 462   push_CPU_state();
 463 
 464   ExternalAddress message((address) msg);
 465   // push address of message
 466   pushptr(message.addr());
 467 
 468   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 469   addl(rsp, wordSize);       // discard argument
 470   pop_CPU_state();
 471 }
 472 
 473 void MacroAssembler::print_state() {
 474   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 475   pusha();                                            // push registers
 476 
 477   push_CPU_state();
 478   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 479   pop_CPU_state();
 480 
 481   popa();
 482   addl(rsp, wordSize);
 483 }
 484 
 485 #else // _LP64
 486 
 487 // 64 bit versions
 488 
 489 Address MacroAssembler::as_Address(AddressLiteral adr) {
 490   // amd64 always does this as a pc-rel
 491   // we can be absolute or disp based on the instruction type
 492   // jmp/call are displacements, others are absolute
 493   assert(!adr.is_lval(), "must be rval");
 494   assert(reachable(adr), "must be");
 495   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 496 
 497 }
 498 
 499 Address MacroAssembler::as_Address(ArrayAddress adr) {
 500   AddressLiteral base = adr.base();
 501   lea(rscratch1, base);
 502   Address index = adr.index();
 503   assert(index._disp == 0, "must not have disp"); // maybe it can?
 504   Address array(rscratch1, index._index, index._scale, index._disp);
 505   return array;
 506 }
 507 
 508 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 509   Label L, E;
 510 
 511 #ifdef _WIN64
 512   // Windows always allocates space for its register args
 513   assert(num_args <= 4, "only register arguments supported");
 514   subq(rsp,  frame::arg_reg_save_area_bytes);
 515 #endif
 516 
 517   // Align stack if necessary
 518   testl(rsp, 15);
 519   jcc(Assembler::zero, L);
 520 
 521   subq(rsp, 8);
 522   {
 523     call(RuntimeAddress(entry_point));
 524   }
 525   addq(rsp, 8);
 526   jmp(E);
 527 
 528   bind(L);
 529   {
 530     call(RuntimeAddress(entry_point));
 531   }
 532 
 533   bind(E);
 534 
 535 #ifdef _WIN64
 536   // restore stack pointer
 537   addq(rsp, frame::arg_reg_save_area_bytes);
 538 #endif
 539 
 540 }
 541 
 542 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 543   assert(!src2.is_lval(), "should use cmpptr");
 544 
 545   if (reachable(src2)) {
 546     cmpq(src1, as_Address(src2));
 547   } else {
 548     lea(rscratch1, src2);
 549     Assembler::cmpq(src1, Address(rscratch1, 0));
 550   }
 551 }
 552 
 553 int MacroAssembler::corrected_idivq(Register reg) {
 554   // Full implementation of Java ldiv and lrem; checks for special
 555   // case as described in JVM spec., p.243 & p.271.  The function
 556   // returns the (pc) offset of the idivl instruction - may be needed
 557   // for implicit exceptions.
 558   //
 559   //         normal case                           special case
 560   //
 561   // input : rax: dividend                         min_long
 562   //         reg: divisor   (may not be eax/edx)   -1
 563   //
 564   // output: rax: quotient  (= rax idiv reg)       min_long
 565   //         rdx: remainder (= rax irem reg)       0
 566   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 567   static const int64_t min_long = 0x8000000000000000;
 568   Label normal_case, special_case;
 569 
 570   // check for special case
 571   cmp64(rax, ExternalAddress((address) &min_long));
 572   jcc(Assembler::notEqual, normal_case);
 573   xorl(rdx, rdx); // prepare rdx for possible special case (where
 574                   // remainder = 0)
 575   cmpq(reg, -1);
 576   jcc(Assembler::equal, special_case);
 577 
 578   // handle normal case
 579   bind(normal_case);
 580   cdqq();
 581   int idivq_offset = offset();
 582   idivq(reg);
 583 
 584   // normal and special case exit
 585   bind(special_case);
 586 
 587   return idivq_offset;
 588 }
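     // Editor's note: the special case exists because idiv raises #DE (divide error)
     // when the quotient overflows, which happens exactly for min_long / -1. The JVM
     // spec requires Long.MIN_VALUE / -1 == Long.MIN_VALUE and Long.MIN_VALUE % -1 == 0,
     // which the branch around idivq produces (rax keeps min_long, rdx was zeroed).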
 589 
 590 void MacroAssembler::decrementq(Register reg, int value) {
 591   if (value == min_jint) { subq(reg, value); return; }
 592   if (value <  0) { incrementq(reg, -value); return; }
 593   if (value == 0) {                        ; return; }
 594   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 595   /* else */      { subq(reg, value)       ; return; }
 596 }
 597 
 598 void MacroAssembler::decrementq(Address dst, int value) {
 599   if (value == min_jint) { subq(dst, value); return; }
 600   if (value <  0) { incrementq(dst, -value); return; }
 601   if (value == 0) {                        ; return; }
 602   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 603   /* else */      { subq(dst, value)       ; return; }
 604 }
 605 
 606 void MacroAssembler::incrementq(AddressLiteral dst) {
 607   if (reachable(dst)) {
 608     incrementq(as_Address(dst));
 609   } else {
 610     lea(rscratch1, dst);
 611     incrementq(Address(rscratch1, 0));
 612   }
 613 }
 614 
 615 void MacroAssembler::incrementq(Register reg, int value) {
 616   if (value == min_jint) { addq(reg, value); return; }
 617   if (value <  0) { decrementq(reg, -value); return; }
 618   if (value == 0) {                        ; return; }
 619   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 620   /* else */      { addq(reg, value)       ; return; }
 621 }
 622 
 623 void MacroAssembler::incrementq(Address dst, int value) {
 624   if (value == min_jint) { addq(dst, value); return; }
 625   if (value <  0) { decrementq(dst, -value); return; }
 626   if (value == 0) {                        ; return; }
 627   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 628   /* else */      { addq(dst, value)       ; return; }
 629 }
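     // Editor's note on the min_jint special case in the four helpers above: the usual
     // "negate and forward to the other helper" rewrite would overflow for
     // value == min_jint (-min_jint is not representable in 32 bits), so add/sub is
     // emitted directly with that immediate.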
 630 
 631 // 32bit can do a case table jump in one instruction but we no longer allow the base
 632 // to be installed in the Address class
 633 void MacroAssembler::jump(ArrayAddress entry) {
 634   lea(rscratch1, entry.base());
 635   Address dispatch = entry.index();
 636   assert(dispatch._base == noreg, "must be");
 637   dispatch._base = rscratch1;
 638   jmp(dispatch);
 639 }
 640 
 641 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 642   ShouldNotReachHere(); // 64bit doesn't use two regs
 643   cmpq(x_lo, y_lo);
 644 }
 645 
 646 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 647     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 648 }
 649 
 650 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 651   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 652   movptr(dst, rscratch1);
 653 }
 654 
 655 void MacroAssembler::leave() {
 656   // %%% is this really better? Why not on 32bit too?
 657   emit_int8((unsigned char)0xC9); // LEAVE
 658 }
 659 
 660 void MacroAssembler::lneg(Register hi, Register lo) {
 661   ShouldNotReachHere(); // 64bit doesn't use two regs
 662   negq(lo);
 663 }
 664 
 665 void MacroAssembler::movoop(Register dst, jobject obj) {
 666   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 667 }
 668 
 669 void MacroAssembler::movoop(Address dst, jobject obj) {
 670   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 671   movq(dst, rscratch1);
 672 }
 673 
 674 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 675   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 676 }
 677 
 678 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 679   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 680   movq(dst, rscratch1);
 681 }
 682 
 683 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 684   if (src.is_lval()) {
 685     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 686   } else {
 687     if (reachable(src)) {
 688       movq(dst, as_Address(src));
 689     } else {
 690       lea(scratch, src);
 691       movq(dst, Address(scratch, 0));
 692     }
 693   }
 694 }
 695 
 696 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 697   movq(as_Address(dst), src);
 698 }
 699 
 700 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 701   movq(dst, as_Address(src));
 702 }
 703 
 704 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 705 void MacroAssembler::movptr(Address dst, intptr_t src) {
 706   if (is_simm32(src)) {
 707     movptr(dst, checked_cast<int32_t>(src));
 708   } else {
 709     mov64(rscratch1, src);
 710     movq(dst, rscratch1);
 711   }
 712 }
 713 
 714 // These are mostly for initializing NULL
 715 void MacroAssembler::movptr(Address dst, int32_t src) {
 716   movslq(dst, src);
 717 }
 718 
 719 void MacroAssembler::movptr(Register dst, int32_t src) {
 720   mov64(dst, (intptr_t)src);
 721 }
 722 
 723 void MacroAssembler::pushoop(jobject obj) {
 724   movoop(rscratch1, obj);
 725   push(rscratch1);
 726 }
 727 
 728 void MacroAssembler::pushklass(Metadata* obj) {
 729   mov_metadata(rscratch1, obj);
 730   push(rscratch1);
 731 }
 732 
 733 void MacroAssembler::pushptr(AddressLiteral src) {
 734   lea(rscratch1, src);
 735   if (src.is_lval()) {
 736     push(rscratch1);
 737   } else {
 738     pushq(Address(rscratch1, 0));
 739   }
 740 }
 741 
 742 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 743   reset_last_Java_frame(r15_thread, clear_fp);
 744 }
 745 
 746 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 747                                          Register last_java_fp,
 748                                          address  last_java_pc) {
 749   vzeroupper();
 750   // determine last_java_sp register
 751   if (!last_java_sp->is_valid()) {
 752     last_java_sp = rsp;
 753   }
 754 
 755   // last_java_fp is optional
 756   if (last_java_fp->is_valid()) {
 757     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 758            last_java_fp);
 759   }
 760 
 761   // last_java_pc is optional
 762   if (last_java_pc != NULL) {
 763     Address java_pc(r15_thread,
 764                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 765     lea(rscratch1, InternalAddress(last_java_pc));
 766     movptr(java_pc, rscratch1);
 767   }
 768 
 769   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 770 }
 771 
 772 static void pass_arg0(MacroAssembler* masm, Register arg) {
 773   if (c_rarg0 != arg ) {
 774     masm->mov(c_rarg0, arg);
 775   }
 776 }
 777 
 778 static void pass_arg1(MacroAssembler* masm, Register arg) {
 779   if (c_rarg1 != arg ) {
 780     masm->mov(c_rarg1, arg);
 781   }
 782 }
 783 
 784 static void pass_arg2(MacroAssembler* masm, Register arg) {
 785   if (c_rarg2 != arg ) {
 786     masm->mov(c_rarg2, arg);
 787   }
 788 }
 789 
 790 static void pass_arg3(MacroAssembler* masm, Register arg) {
 791   if (c_rarg3 != arg ) {
 792     masm->mov(c_rarg3, arg);
 793   }
 794 }
 795 
 796 void MacroAssembler::stop(const char* msg) {
 797   if (ShowMessageBoxOnError) {
 798     address rip = pc();
 799     pusha(); // get regs on stack
 800     lea(c_rarg1, InternalAddress(rip));
 801     movq(c_rarg2, rsp); // pass pointer to regs array
 802   }
 803   lea(c_rarg0, ExternalAddress((address) msg));
 804   andq(rsp, -16); // align stack as required by ABI
 805   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 806   hlt();
 807 }
 808 
 809 void MacroAssembler::warn(const char* msg) {
 810   push(rbp);
 811   movq(rbp, rsp);
 812   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 813   push_CPU_state();   // keeps alignment at 16 bytes
 814   lea(c_rarg0, ExternalAddress((address) msg));
 815   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
 816   call(rax);
 817   pop_CPU_state();
 818   mov(rsp, rbp);
 819   pop(rbp);
 820 }
 821 
 822 void MacroAssembler::print_state() {
 823   address rip = pc();
 824   pusha();            // get regs on stack
 825   push(rbp);
 826   movq(rbp, rsp);
 827   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 828   push_CPU_state();   // keeps alignment at 16 bytes
 829 
 830   lea(c_rarg0, InternalAddress(rip));
 831   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 832   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 833 
 834   pop_CPU_state();
 835   mov(rsp, rbp);
 836   pop(rbp);
 837   popa();
 838 }
 839 
 840 #ifndef PRODUCT
 841 extern "C" void findpc(intptr_t x);
 842 #endif
 843 
 844 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 845   // In order to get locks to work, we need to fake an in_VM state
 846   if (ShowMessageBoxOnError) {
 847     JavaThread* thread = JavaThread::current();
 848     JavaThreadState saved_state = thread->thread_state();
 849     thread->set_thread_state(_thread_in_vm);
 850 #ifndef PRODUCT
 851     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 852       ttyLocker ttyl;
 853       BytecodeCounter::print();
 854     }
 855 #endif
 856     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 857     // XXX correct this offset for amd64
 858     // This is the value of eip which points to where verify_oop will return.
 859     if (os::message_box(msg, "Execution stopped, print registers?")) {
 860       print_state64(pc, regs);
 861       BREAKPOINT;
 862     }
 863   }
 864   fatal("DEBUG MESSAGE: %s", msg);
 865 }
 866 
 867 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 868   ttyLocker ttyl;
 869   FlagSetting fs(Debugging, true);
 870   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 871 #ifndef PRODUCT
 872   tty->cr();
 873   findpc(pc);
 874   tty->cr();
 875 #endif
 876 #define PRINT_REG(rax, value) \
 877   { tty->print("%s = ", #rax); os::print_location(tty, value); }
 878   PRINT_REG(rax, regs[15]);
 879   PRINT_REG(rbx, regs[12]);
 880   PRINT_REG(rcx, regs[14]);
 881   PRINT_REG(rdx, regs[13]);
 882   PRINT_REG(rdi, regs[8]);
 883   PRINT_REG(rsi, regs[9]);
 884   PRINT_REG(rbp, regs[10]);
 885   // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
 886   PRINT_REG(rsp, (intptr_t)(&regs[16]));
 887   PRINT_REG(r8 , regs[7]);
 888   PRINT_REG(r9 , regs[6]);
 889   PRINT_REG(r10, regs[5]);
 890   PRINT_REG(r11, regs[4]);
 891   PRINT_REG(r12, regs[3]);
 892   PRINT_REG(r13, regs[2]);
 893   PRINT_REG(r14, regs[1]);
 894   PRINT_REG(r15, regs[0]);
 895 #undef PRINT_REG
 896   // Print some words near the top of the stack.
 897   int64_t* rsp = &regs[16];
 898   int64_t* dump_sp = rsp;
 899   for (int col1 = 0; col1 < 8; col1++) {
 900     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 901     os::print_location(tty, *dump_sp++);
 902   }
 903   for (int row = 0; row < 25; row++) {
 904     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 905     for (int col = 0; col < 4; col++) {
 906       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 907     }
 908     tty->cr();
 909   }
 910   // Print some instructions around pc:
 911   Disassembler::decode((address)pc-64, (address)pc);
 912   tty->print_cr("--------");
 913   Disassembler::decode((address)pc, (address)pc+32);
 914 }
 915 
 916 // The java_calling_convention describes stack locations as ideal slots on
 917 // a frame with no abi restrictions. Since we must observe abi restrictions
 918 // (like the placement of the saved rbp and the return address) the slots
 919 // must be biased by the following value.
 920 static int reg2offset_in(VMReg r) {
 921   // Account for saved rbp and return address
 922   // This should really be in_preserve_stack_slots
 923   return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
 924 }
 925 
 926 static int reg2offset_out(VMReg r) {
 927   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
 928 }
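     // Editor's example (assuming VMRegImpl::stack_slot_size == 4): an incoming value in
     // stack slot 2 maps to (2 + 4) * 4 = 24 bytes above rbp in reg2offset_in(); the
     // "+ 4" skips the four 32-bit slots holding the saved rbp and the return address.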
 929 
 930 // A long move
 931 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst) {
 932 
 933   // The calling convention assures us that each VMRegPair is either
 934   // a single physical register or a pair of adjacent stack slots.
 935 
 936   if (src.is_single_phys_reg() ) {
 937     if (dst.is_single_phys_reg()) {
 938       if (dst.first() != src.first()) {
 939         mov(dst.first()->as_Register(), src.first()->as_Register());
 940       }
 941     } else {
 942       assert(dst.is_single_reg(), "not a stack pair");
 943       movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
 944     }
 945   } else if (dst.is_single_phys_reg()) {
 946     assert(src.is_single_reg(),  "not a stack pair");
 947     movq(dst.first()->as_Register(), Address(rbp, reg2offset_out(src.first())));
 948   } else {
 949     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 950     movq(rax, Address(rbp, reg2offset_in(src.first())));
 951     movq(Address(rsp, reg2offset_out(dst.first())), rax);
 952   }
 953 }
 954 
 955 // A double move
 956 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst) {
 957 
 958   // The calling convention assures us that each VMRegPair is either
 959   // a single physical register or a pair of adjacent stack slots.
 960 
 961   if (src.is_single_phys_reg() ) {
 962     if (dst.is_single_phys_reg()) {
 963       // In theory these overlap but the ordering is such that this is likely a nop
 964       if ( src.first() != dst.first()) {
 965         movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
 966       }
 967     } else {
 968       assert(dst.is_single_reg(), "not a stack pair");
 969       movdbl(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
 970     }
 971   } else if (dst.is_single_phys_reg()) {
 972     assert(src.is_single_reg(),  "not a stack pair");
 973     movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_out(src.first())));
 974   } else {
 975     assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
 976     movq(rax, Address(rbp, reg2offset_in(src.first())));
 977     movq(Address(rsp, reg2offset_out(dst.first())), rax);
 978   }
 979 }
 980 
 981 
 982 // A float arg may have to do a float-reg to int-reg conversion
 983 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst) {
 984   assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
 985 
 986   // The calling convention assures us that each VMRegPair is either
 987   // a single physical register or a pair of adjacent stack slots.
 988 
 989   if (src.first()->is_stack()) {
 990     if (dst.first()->is_stack()) {
 991       movl(rax, Address(rbp, reg2offset_in(src.first())));
 992       movptr(Address(rsp, reg2offset_out(dst.first())), rax);
 993     } else {
 994       // stack to reg
 995       assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
 996       movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first())));
 997     }
 998   } else if (dst.first()->is_stack()) {
 999     // reg to stack
1000     assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
1001     movflt(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
1002   } else {
1003     // reg to reg
1004     // In theory these overlap but the ordering is such that this is likely a nop
1005     if ( src.first() != dst.first()) {
1006       movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
1007     }
1008   }
1009 }
1010 
1011 // On 64 bit we will store integer-like items to the stack as
1012 // 64-bit items (x86_32/64 ABI) even though Java would only store
1013 // 32 bits for a parameter. On 32 bit it will simply be 32 bits,
1014 // so this routine will do 32->32 on 32 bit and 32->64 on 64 bit.
1015 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst) {
1016   if (src.first()->is_stack()) {
1017     if (dst.first()->is_stack()) {
1018       // stack to stack
1019       movslq(rax, Address(rbp, reg2offset_in(src.first())));
1020       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1021     } else {
1022       // stack to reg
1023       movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1024     }
1025   } else if (dst.first()->is_stack()) {
1026     // reg to stack
1027     // Do we really have to sign extend???
1028     // __ movslq(src.first()->as_Register(), src.first()->as_Register());
1029     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1030   } else {
1031     // Do we really have to sign extend???
1032     // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
1033     if (dst.first() != src.first()) {
1034       movq(dst.first()->as_Register(), src.first()->as_Register());
1035     }
1036   }
1037 }
1038 
1039 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
1040   if (src.first()->is_stack()) {
1041     if (dst.first()->is_stack()) {
1042       // stack to stack
1043       movq(rax, Address(rbp, reg2offset_in(src.first())));
1044       movq(Address(rsp, reg2offset_out(dst.first())), rax);
1045     } else {
1046       // stack to reg
1047       movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1048     }
1049   } else if (dst.first()->is_stack()) {
1050     // reg to stack
1051     movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1052   } else {
1053     if (dst.first() != src.first()) {
1054       movq(dst.first()->as_Register(), src.first()->as_Register());
1055     }
1056   }
1057 }
1058 
1059 // An oop arg. Must pass a handle not the oop itself
1060 void MacroAssembler::object_move(OopMap* map,
1061                         int oop_handle_offset,
1062                         int framesize_in_slots,
1063                         VMRegPair src,
1064                         VMRegPair dst,
1065                         bool is_receiver,
1066                         int* receiver_offset) {
1067 
1068   // must pass a handle. First figure out the location we use as a handle
1069 
1070   Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1071 
1072   // See if the oop is NULL; if it is we need no handle
1073 
1074   if (src.first()->is_stack()) {
1075 
1076     // Oop is already on the stack as an argument
1077     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1078     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1079     if (is_receiver) {
1080       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1081     }
1082 
1083     cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
1084     lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1085     // conditionally move a NULL
1086     cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1087   } else {
1088 
1089     // Oop is in a register; we must store it to the space we reserve
1090     // on the stack for oop_handles and pass a handle if the oop is non-NULL
1091 
1092     const Register rOop = src.first()->as_Register();
1093     int oop_slot;
1094     if (rOop == j_rarg0)
1095       oop_slot = 0;
1096     else if (rOop == j_rarg1)
1097       oop_slot = 1;
1098     else if (rOop == j_rarg2)
1099       oop_slot = 2;
1100     else if (rOop == j_rarg3)
1101       oop_slot = 3;
1102     else if (rOop == j_rarg4)
1103       oop_slot = 4;
1104     else {
1105       assert(rOop == j_rarg5, "wrong register");
1106       oop_slot = 5;
1107     }
1108 
1109     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1110     int offset = oop_slot*VMRegImpl::stack_slot_size;
1111 
1112     map->set_oop(VMRegImpl::stack2reg(oop_slot));
1113     // Store oop in handle area, may be NULL
1114     movptr(Address(rsp, offset), rOop);
1115     if (is_receiver) {
1116       *receiver_offset = offset;
1117     }
1118 
1119     cmpptr(rOop, (int32_t)NULL_WORD);
1120     lea(rHandle, Address(rsp, offset));
1121     // conditionally move a NULL from the handle area where it was just stored
1122     cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1123   }
1124 
1125   // If the arg is on the stack then place it, otherwise it is already in the correct reg.
1126   if (dst.first()->is_stack()) {
1127     movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1128   }
1129 }
1130 
1131 #endif // _LP64
1132 
1133 // Now versions that are common to 32/64 bit
1134 
1135 void MacroAssembler::addptr(Register dst, int32_t imm32) {
1136   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
1137 }
1138 
1139 void MacroAssembler::addptr(Register dst, Register src) {
1140   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1141 }
1142 
1143 void MacroAssembler::addptr(Address dst, Register src) {
1144   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1145 }
1146 
1147 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
1148   if (reachable(src)) {
1149     Assembler::addsd(dst, as_Address(src));
1150   } else {
1151     lea(rscratch1, src);
1152     Assembler::addsd(dst, Address(rscratch1, 0));
1153   }
1154 }
1155 
1156 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
1157   if (reachable(src)) {
1158     addss(dst, as_Address(src));
1159   } else {
1160     lea(rscratch1, src);
1161     addss(dst, Address(rscratch1, 0));
1162   }
1163 }
1164 
1165 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
1166   if (reachable(src)) {
1167     Assembler::addpd(dst, as_Address(src));
1168   } else {
1169     lea(rscratch1, src);
1170     Assembler::addpd(dst, Address(rscratch1, 0));
1171   }
1172 }
1173 
1174 // See 8273459.  Function for ensuring 64-byte alignment, intended for stubs only.
1175 // Stub code is generated once and never copied.
1176 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
1177 void MacroAssembler::align64() {
1178   align(64, (unsigned long long) pc());
1179 }
1180 
1181 void MacroAssembler::align32() {
1182   align(32, (unsigned long long) pc());
1183 }
1184 
1185 void MacroAssembler::align(int modulus) {
1186   // 8273459: Ensure alignment is possible with current segment alignment
1187   assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
1188   align(modulus, offset());
1189 }
1190 
1191 void MacroAssembler::align(int modulus, int target) {
1192   if (target % modulus != 0) {
1193     nop(modulus - (target % modulus));
1194   }
1195 }
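     // Editor's example: align(16, 37) pads with 16 - (37 % 16) = 11 bytes of nop;
     // if target is already a multiple of the modulus, nothing is emitted.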
1196 
1197 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1198   // Used in sign-masking with aligned address.
1199   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1200   if (reachable(src)) {
1201     Assembler::andpd(dst, as_Address(src));
1202   } else {
1203     lea(scratch_reg, src);
1204     Assembler::andpd(dst, Address(scratch_reg, 0));
1205   }
1206 }
1207 
1208 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1209   // Used in sign-masking with aligned address.
1210   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1211   if (reachable(src)) {
1212     Assembler::andps(dst, as_Address(src));
1213   } else {
1214     lea(scratch_reg, src);
1215     Assembler::andps(dst, Address(scratch_reg, 0));
1216   }
1217 }
1218 
1219 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1220   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1221 }
1222 
1223 void MacroAssembler::atomic_incl(Address counter_addr) {
1224   lock();
1225   incrementl(counter_addr);
1226 }
1227 
1228 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1229   if (reachable(counter_addr)) {
1230     atomic_incl(as_Address(counter_addr));
1231   } else {
1232     lea(scr, counter_addr);
1233     atomic_incl(Address(scr, 0));
1234   }
1235 }
1236 
1237 #ifdef _LP64
1238 void MacroAssembler::atomic_incq(Address counter_addr) {
1239   lock();
1240   incrementq(counter_addr);
1241 }
1242 
1243 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1244   if (reachable(counter_addr)) {
1245     atomic_incq(as_Address(counter_addr));
1246   } else {
1247     lea(scr, counter_addr);
1248     atomic_incq(Address(scr, 0));
1249   }
1250 }
1251 #endif
1252 
1253 // Writes to stack successive pages until offset reached to check for
1254 // stack overflow + shadow pages.  This clobbers tmp.
1255 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1256   movptr(tmp, rsp);
1257   // Bang stack for total size given plus shadow page size.
1258   // Bang one page at a time because large size can bang beyond yellow and
1259   // red zones.
1260   Label loop;
1261   bind(loop);
1262   movl(Address(tmp, (-os::vm_page_size())), size );
1263   subptr(tmp, os::vm_page_size());
1264   subl(size, os::vm_page_size());
1265   jcc(Assembler::greater, loop);
1266 
1267   // Bang down shadow pages too.
1268   // At this point, (tmp-0) is the last address touched, so don't
1269   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1270   // was post-decremented.)  Skip this address by starting at i=1, and
1271   // touch a few more pages below.  N.B.  It is important to touch all
1272   // the way down including all pages in the shadow zone.
1273   for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / os::vm_page_size()); i++) {
1274     // this could be any sized move but this can be a debugging crumb
1275     // so the bigger the better.
1276     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1277   }
1278 }
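     // Editor's sketch of the loop above: with a 4K page and size = 12K the loop writes
     // to tmp-4K, tmp-8K and tmp-12K (size then reaches 0 and the 'greater' branch falls
     // through), after which the shadow-zone pages below the last banged address are
     // touched one at a time.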
1279 
1280 void MacroAssembler::reserved_stack_check() {
1281     // testing if reserved zone needs to be enabled
1282     Label no_reserved_zone_enabling;
1283     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1284     NOT_LP64(get_thread(rsi);)
1285 
1286     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1287     jcc(Assembler::below, no_reserved_zone_enabling);
1288 
1289     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1290     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1291     should_not_reach_here();
1292 
1293     bind(no_reserved_zone_enabling);
1294 }
1295 
1296 void MacroAssembler::biased_locking_enter(Register lock_reg,
1297                                           Register obj_reg,
1298                                           Register swap_reg,
1299                                           Register tmp_reg,
1300                                           Register tmp_reg2,
1301                                           bool swap_reg_contains_mark,
1302                                           Label& done,
1303                                           Label* slow_case,
1304                                           BiasedLockingCounters* counters) {
1305   assert(UseBiasedLocking, "why call this otherwise?");
1306   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1307   assert(tmp_reg != noreg, "tmp_reg must be supplied");
1308   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1309   assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
1310   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
1311   NOT_LP64( Address saved_mark_addr(lock_reg, 0); )
1312 
1313   if (PrintBiasedLockingStatistics && counters == NULL) {
1314     counters = BiasedLocking::counters();
1315   }
1316   // Biased locking
1317   // See whether the lock is currently biased toward our thread and
1318   // whether the epoch is still valid
1319   // Note that the runtime guarantees sufficient alignment of JavaThread
1320   // pointers to allow age to be placed into low bits
1321   // First check to see whether biasing is even enabled for this object
1322   Label cas_label;
1323   if (!swap_reg_contains_mark) {
1324     movptr(swap_reg, mark_addr);
1325   }
1326   movptr(tmp_reg, swap_reg);
1327   andptr(tmp_reg, markWord::biased_lock_mask_in_place);
1328   cmpptr(tmp_reg, markWord::biased_lock_pattern);
1329   jcc(Assembler::notEqual, cas_label);
1330   // The bias pattern is present in the object's header. Need to check
1331   // whether the bias owner and the epoch are both still current.
1332 #ifndef _LP64
1333   // Note that because there is no current thread register on x86_32 we
1334   // need to store off the mark word we read out of the object to
1335   // avoid reloading it and needing to recheck invariants below. This
1336   // store is unfortunate but it makes the overall code shorter and
1337   // simpler.
1338   movptr(saved_mark_addr, swap_reg);
1339 #endif
1340   load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
1341 #ifdef _LP64
1342   orptr(tmp_reg, r15_thread);
1343   xorptr(tmp_reg, swap_reg);
1344   Register header_reg = tmp_reg;
1345 #else
1346   xorptr(tmp_reg, swap_reg);
1347   get_thread(swap_reg);
1348   xorptr(swap_reg, tmp_reg);
1349   Register header_reg = swap_reg;
1350 #endif
1351   andptr(header_reg, ~((int) markWord::age_mask_in_place));
1352   if (counters != NULL) {
1353     cond_inc32(Assembler::zero,
1354                ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1355   }
1356   jcc(Assembler::equal, done);
1357 
1358   Label try_revoke_bias;
1359   Label try_rebias;
1360 
1361   // At this point we know that the header has the bias pattern and
1362   // that we are not the bias owner in the current epoch. We need to
1363   // figure out more details about the state of the header in order to
1364   // know what operations can be legally performed on the object's
1365   // header.
1366 
1367   // If the low three bits in the xor result aren't clear, that means
1368   // the prototype header is no longer biased and we have to revoke
1369   // the bias on this object.
1370   testptr(header_reg, markWord::biased_lock_mask_in_place);
1371   jcc(Assembler::notZero, try_revoke_bias);
1372 
1373   // Biasing is still enabled for this data type. See whether the
1374   // epoch of the current bias is still valid, meaning that the epoch
1375   // bits of the mark word are equal to the epoch bits of the
1376   // prototype header. (Note that the prototype header's epoch bits
1377   // only change at a safepoint.) If not, attempt to rebias the object
1378   // toward the current thread. Note that we must be absolutely sure
1379   // that the current epoch is invalid in order to do this because
1380   // otherwise the manipulations it performs on the mark word are
1381   // illegal.
1382   testptr(header_reg, markWord::epoch_mask_in_place);
1383   jccb(Assembler::notZero, try_rebias);
1384 
1385   // The epoch of the current bias is still valid but we know nothing
1386   // about the owner; it might be set or it might be clear. Try to
1387   // acquire the bias of the object using an atomic operation. If this
1388   // fails we will go in to the runtime to revoke the object's bias.
1389   // Note that we first construct the presumed unbiased header so we
1390   // don't accidentally blow away another thread's valid bias.
1391   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1392   andptr(swap_reg,
1393          markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
1394 #ifdef _LP64
1395   movptr(tmp_reg, swap_reg);
1396   orptr(tmp_reg, r15_thread);
1397 #else
1398   get_thread(tmp_reg);
1399   orptr(tmp_reg, swap_reg);
1400 #endif
1401   lock();
1402   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1403   // If the biasing toward our thread failed, this means that
1404   // another thread succeeded in biasing it toward itself and we
1405   // need to revoke that bias. The revocation will occur in the
1406   // interpreter runtime in the slow case.
1407   if (counters != NULL) {
1408     cond_inc32(Assembler::zero,
1409                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1410   }
1411   if (slow_case != NULL) {
1412     jcc(Assembler::notZero, *slow_case);
1413   }
1414   jmp(done);
1415 
1416   bind(try_rebias);
1417   // At this point we know the epoch has expired, meaning that the
1418   // current "bias owner", if any, is actually invalid. Under these
1419   // circumstances _only_, we are allowed to use the current header's
1420   // value as the comparison value when doing the cas to acquire the
1421   // bias in the current epoch. In other words, we allow transfer of
1422   // the bias from one thread to another directly in this situation.
1423   //
1424   // FIXME: due to a lack of registers we currently blow away the age
1425   // bits in this situation. Should attempt to preserve them.
1426   load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
1427 #ifdef _LP64
1428   orptr(tmp_reg, r15_thread);
1429 #else
1430   get_thread(swap_reg);
1431   orptr(tmp_reg, swap_reg);
1432   movptr(swap_reg, saved_mark_addr);
1433 #endif
1434   lock();
1435   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1436   // If the biasing toward our thread failed, then another thread
1437   // succeeded in biasing it toward itself and we need to revoke that
1438   // bias. The revocation will occur in the runtime in the slow case.
1439   if (counters != NULL) {
1440     cond_inc32(Assembler::zero,
1441                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1442   }
1443   if (slow_case != NULL) {
1444     jcc(Assembler::notZero, *slow_case);
1445   }
1446   jmp(done);
1447 
1448   bind(try_revoke_bias);
1449   // The prototype mark in the klass doesn't have the bias bit set any
1450   // more, indicating that objects of this data type are not supposed
1451   // to be biased any more. We are going to try to reset the mark of
1452   // this object to the prototype value and fall through to the
1453   // CAS-based locking scheme. Note that if our CAS fails, it means
1454   // that another thread raced us for the privilege of revoking the
1455   // bias of this particular object, so it's okay to continue in the
1456   // normal locking code.
1457   //
1458   // FIXME: due to a lack of registers we currently blow away the age
1459   // bits in this situation. Should attempt to preserve them.
1460   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1461   load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
1462   lock();
1463   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1464   // Fall through to the normal CAS-based lock, because no matter what
1465   // the result of the above CAS, some thread must have succeeded in
1466   // removing the bias bit from the object's header.
1467   if (counters != NULL) {
1468     cond_inc32(Assembler::zero,
1469                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1470   }
1471 
1472   bind(cas_label);
1473 }
1474 
1475 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1476   assert(UseBiasedLocking, "why call this otherwise?");
1477 
1478   // Check for biased locking unlock case, which is a no-op
1479   // Note: we do not have to check the thread ID for two reasons.
1480   // First, the interpreter checks for IllegalMonitorStateException at
1481   // a higher level. Second, if the bias was revoked while we held the
1482   // lock, the object could not be rebiased toward another thread, so
1483   // the bias bit would be clear.
1484   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1485   andptr(temp_reg, markWord::biased_lock_mask_in_place);
1486   cmpptr(temp_reg, markWord::biased_lock_pattern);
1487   jcc(Assembler::equal, done);
1488 }
1489 
1490 void MacroAssembler::c2bool(Register x) {
1491   // implements x == 0 ? 0 : 1
1492   // note: must only look at least-significant byte of x
1493   //       since C-style booleans are stored in one byte
1494   //       only! (was bug)
1495   andl(x, 0xFF);
1496   setb(Assembler::notZero, x);
1497 }
1498 
1499 // Wouldn't be needed if the AddressLiteral version had a different name
1500 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1501   Assembler::call(L, rtype);
1502 }
1503 
1504 void MacroAssembler::call(Register entry) {
1505   Assembler::call(entry);
1506 }
1507 
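// AddressLiteral variant of call. On 64 bit a direct call only carries a rel32
// displacement, so a target that reachable() reports as outside the +/-2GB
// range of the code cache is materialized into rscratch1 and called
// indirectly. The same reachable()/lea(rscratch1, ...) fallback pattern is
// used by most of the AddressLiteral wrappers in this file.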
1508 void MacroAssembler::call(AddressLiteral entry) {
1509   if (reachable(entry)) {
1510     Assembler::call_literal(entry.target(), entry.rspec());
1511   } else {
1512     lea(rscratch1, entry);
1513     Assembler::call(rscratch1);
1514   }
1515 }
1516 
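// Inline cache call: rax carries the cached metadata checked by the callee's
// inline cache test; it is pre-set here to the non_oop_word sentinel, which
// can never match a real receiver, so the call site starts out unresolved.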
1517 void MacroAssembler::ic_call(address entry, jint method_index) {
1518   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
1519   movptr(rax, (intptr_t)Universe::non_oop_word());
1520   call(AddressLiteral(entry, rh));
1521 }
1522 
1523 // Implementation of call_VM versions
1524 
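// The Label C/E pattern below: call(C) pushes the address of the following
// jmp(E) as a return address and transfers control to C, where the arguments
// are passed and call_VM_helper performs the actual VM call. That pushed
// return address is what lets call_VM_helper derive last_Java_pc from
// last_Java_sp[-1] (see the comment in call_VM_helper); ret(0) then returns
// to the jmp(E), which jumps past the out-of-line block.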
1525 void MacroAssembler::call_VM(Register oop_result,
1526                              address entry_point,
1527                              bool check_exceptions) {
1528   Label C, E;
1529   call(C, relocInfo::none);
1530   jmp(E);
1531 
1532   bind(C);
1533   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1534   ret(0);
1535 
1536   bind(E);
1537 }
1538 
1539 void MacroAssembler::call_VM(Register oop_result,
1540                              address entry_point,
1541                              Register arg_1,
1542                              bool check_exceptions) {
1543   Label C, E;
1544   call(C, relocInfo::none);
1545   jmp(E);
1546 
1547   bind(C);
1548   pass_arg1(this, arg_1);
1549   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1550   ret(0);
1551 
1552   bind(E);
1553 }
1554 
1555 void MacroAssembler::call_VM(Register oop_result,
1556                              address entry_point,
1557                              Register arg_1,
1558                              Register arg_2,
1559                              bool check_exceptions) {
1560   Label C, E;
1561   call(C, relocInfo::none);
1562   jmp(E);
1563 
1564   bind(C);
1565 
1566   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1567 
1568   pass_arg2(this, arg_2);
1569   pass_arg1(this, arg_1);
1570   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1571   ret(0);
1572 
1573   bind(E);
1574 }
1575 
1576 void MacroAssembler::call_VM(Register oop_result,
1577                              address entry_point,
1578                              Register arg_1,
1579                              Register arg_2,
1580                              Register arg_3,
1581                              bool check_exceptions) {
1582   Label C, E;
1583   call(C, relocInfo::none);
1584   jmp(E);
1585 
1586   bind(C);
1587 
1588   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1589   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1590   pass_arg3(this, arg_3);
1591 
1592   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1593   pass_arg2(this, arg_2);
1594 
1595   pass_arg1(this, arg_1);
1596   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1597   ret(0);
1598 
1599   bind(E);
1600 }
1601 
1602 void MacroAssembler::call_VM(Register oop_result,
1603                              Register last_java_sp,
1604                              address entry_point,
1605                              int number_of_arguments,
1606                              bool check_exceptions) {
1607   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1608   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1609 }
1610 
1611 void MacroAssembler::call_VM(Register oop_result,
1612                              Register last_java_sp,
1613                              address entry_point,
1614                              Register arg_1,
1615                              bool check_exceptions) {
1616   pass_arg1(this, arg_1);
1617   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1618 }
1619 
1620 void MacroAssembler::call_VM(Register oop_result,
1621                              Register last_java_sp,
1622                              address entry_point,
1623                              Register arg_1,
1624                              Register arg_2,
1625                              bool check_exceptions) {
1626 
1627   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1628   pass_arg2(this, arg_2);
1629   pass_arg1(this, arg_1);
1630   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1631 }
1632 
1633 void MacroAssembler::call_VM(Register oop_result,
1634                              Register last_java_sp,
1635                              address entry_point,
1636                              Register arg_1,
1637                              Register arg_2,
1638                              Register arg_3,
1639                              bool check_exceptions) {
1640   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1641   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1642   pass_arg3(this, arg_3);
1643   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1644   pass_arg2(this, arg_2);
1645   pass_arg1(this, arg_1);
1646   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1647 }
1648 
1649 void MacroAssembler::super_call_VM(Register oop_result,
1650                                    Register last_java_sp,
1651                                    address entry_point,
1652                                    int number_of_arguments,
1653                                    bool check_exceptions) {
1654   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1655   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1656 }
1657 
1658 void MacroAssembler::super_call_VM(Register oop_result,
1659                                    Register last_java_sp,
1660                                    address entry_point,
1661                                    Register arg_1,
1662                                    bool check_exceptions) {
1663   pass_arg1(this, arg_1);
1664   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1665 }
1666 
1667 void MacroAssembler::super_call_VM(Register oop_result,
1668                                    Register last_java_sp,
1669                                    address entry_point,
1670                                    Register arg_1,
1671                                    Register arg_2,
1672                                    bool check_exceptions) {
1673 
1674   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1675   pass_arg2(this, arg_2);
1676   pass_arg1(this, arg_1);
1677   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1678 }
1679 
1680 void MacroAssembler::super_call_VM(Register oop_result,
1681                                    Register last_java_sp,
1682                                    address entry_point,
1683                                    Register arg_1,
1684                                    Register arg_2,
1685                                    Register arg_3,
1686                                    bool check_exceptions) {
1687   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1688   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1689   pass_arg3(this, arg_3);
1690   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1691   pass_arg2(this, arg_2);
1692   pass_arg1(this, arg_1);
1693   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1694 }
1695 
1696 void MacroAssembler::call_VM_base(Register oop_result,
1697                                   Register java_thread,
1698                                   Register last_java_sp,
1699                                   address  entry_point,
1700                                   int      number_of_arguments,
1701                                   bool     check_exceptions) {
1702   // determine java_thread register
1703   if (!java_thread->is_valid()) {
1704 #ifdef _LP64
1705     java_thread = r15_thread;
1706 #else
1707     java_thread = rdi;
1708     get_thread(java_thread);
1709 #endif // LP64
1710   }
1711   // determine last_java_sp register
1712   if (!last_java_sp->is_valid()) {
1713     last_java_sp = rsp;
1714   }
1715   // debugging support
1716   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
1717   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
1718 #ifdef ASSERT
1719   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1720   // r12 is the heapbase.
1721   LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
1722 #endif // ASSERT
1723 
1724   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
1725   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1726 
1727   // push java thread (becomes first argument of C function)
1728 
1729   NOT_LP64(push(java_thread); number_of_arguments++);
1730   LP64_ONLY(mov(c_rarg0, r15_thread));
1731 
1732   // set last Java frame before call
1733   assert(last_java_sp != rbp, "can't use ebp/rbp");
1734 
1735   // Only interpreter should have to set fp
1736   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
1737 
1738   // do the call, remove parameters
1739   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1740 
1741   // restore the thread (cannot use the pushed argument since arguments
1742   // may be overwritten by C code generated by an optimizing compiler);
1743   // however can use the register value directly if it is callee saved.
1744   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
1745     // rdi & rsi (also r15) are callee saved -> nothing to do
1746 #ifdef ASSERT
1747     guarantee(java_thread != rax, "change this code");
1748     push(rax);
1749     { Label L;
1750       get_thread(rax);
1751       cmpptr(java_thread, rax);
1752       jcc(Assembler::equal, L);
1753       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
1754       bind(L);
1755     }
1756     pop(rax);
1757 #endif
1758   } else {
1759     get_thread(java_thread);
1760   }
1761   // reset last Java frame
1762   // Only interpreter should have to clear fp
1763   reset_last_Java_frame(java_thread, true);
1764 
1765   // The C++ interpreter handles this within the interpreter itself
1766   check_and_handle_popframe(java_thread);
1767   check_and_handle_earlyret(java_thread);
1768 
1769   if (check_exceptions) {
1770     // check for pending exceptions (java_thread is set upon return)
1771     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
1772 #ifndef _LP64
1773     jump_cc(Assembler::notEqual,
1774             RuntimeAddress(StubRoutines::forward_exception_entry()));
1775 #else
1776     // This used to conditionally jump to forward_exception. However, it is
1777     // possible that, after relocation, the branch will not reach, so we must
1778     // jump around it so that we can always reach the target.
1779 
1780     Label ok;
1781     jcc(Assembler::equal, ok);
1782     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1783     bind(ok);
1784 #endif // LP64
1785   }
1786 
1787   // get oop result if there is one and reset the value in the thread
1788   if (oop_result->is_valid()) {
1789     get_vm_result(oop_result, java_thread);
1790   }
1791 }
1792 
1793 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1794 
1795   // Calculating the value for last_Java_sp is somewhat subtle.
1796   // call_VM does an intermediate call which places a return address
1797   // on the stack just under the stack pointer as the user finished
1798   // with it. This allows us to retrieve last_Java_pc from
1799   // last_Java_sp[-1].
1800   // On 32 bit we then have to push additional args on the stack to accomplish
1801   // the actual requested call. On 64 bit call_VM can only use register args,
1802   // so the only extra space is the return address that call_VM created.
1803   // This hopefully explains the calculations here.
1804 
1805 #ifdef _LP64
1806   // We've pushed one address, correct last_Java_sp
1807   lea(rax, Address(rsp, wordSize));
1808 #else
1809   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
1810 #endif // LP64
1811 
1812   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
1813 
1814 }
1815 
1816 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1817 void MacroAssembler::call_VM_leaf0(address entry_point) {
1818   MacroAssembler::call_VM_leaf_base(entry_point, 0);
1819 }
1820 
1821 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1822   call_VM_leaf_base(entry_point, number_of_arguments);
1823 }
1824 
1825 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1826   pass_arg0(this, arg_0);
1827   call_VM_leaf(entry_point, 1);
1828 }
1829 
1830 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1831 
1832   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1833   pass_arg1(this, arg_1);
1834   pass_arg0(this, arg_0);
1835   call_VM_leaf(entry_point, 2);
1836 }
1837 
1838 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1839   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1840   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1841   pass_arg2(this, arg_2);
1842   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1843   pass_arg1(this, arg_1);
1844   pass_arg0(this, arg_0);
1845   call_VM_leaf(entry_point, 3);
1846 }
1847 
1848 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1849   pass_arg0(this, arg_0);
1850   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1851 }
1852 
1853 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1854 
1855   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1856   pass_arg1(this, arg_1);
1857   pass_arg0(this, arg_0);
1858   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1859 }
1860 
1861 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1862   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1863   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1864   pass_arg2(this, arg_2);
1865   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1866   pass_arg1(this, arg_1);
1867   pass_arg0(this, arg_0);
1868   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1869 }
1870 
1871 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1872   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1873   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1874   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1875   pass_arg3(this, arg_3);
1876   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1877   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1878   pass_arg2(this, arg_2);
1879   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1880   pass_arg1(this, arg_1);
1881   pass_arg0(this, arg_0);
1882   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1883 }
1884 
1885 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1886   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1887   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
1888   verify_oop_msg(oop_result, "broken oop in call_VM_base");
1889 }
1890 
1891 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1892   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1893   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
1894 }
1895 
1896 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
1897 }
1898 
1899 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
1900 }
1901 
1902 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
1903   if (reachable(src1)) {
1904     cmpl(as_Address(src1), imm);
1905   } else {
1906     lea(rscratch1, src1);
1907     cmpl(Address(rscratch1, 0), imm);
1908   }
1909 }
1910 
1911 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
1912   assert(!src2.is_lval(), "use cmpptr");
1913   if (reachable(src2)) {
1914     cmpl(src1, as_Address(src2));
1915   } else {
1916     lea(rscratch1, src2);
1917     cmpl(src1, Address(rscratch1, 0));
1918   }
1919 }
1920 
1921 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1922   Assembler::cmpl(src1, imm);
1923 }
1924 
1925 void MacroAssembler::cmp32(Register src1, Address src2) {
1926   Assembler::cmpl(src1, src2);
1927 }
1928 
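// Materializes the three-way result of a floating point compare into dst as
// -1/0/+1, matching the Java fcmpl/fcmpg (dcmpl/dcmpg) semantics: an
// unordered result (NaN operand) maps to -1 or +1 depending on
// unordered_is_less.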
1929 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1930   ucomisd(opr1, opr2);
1931 
1932   Label L;
1933   if (unordered_is_less) {
1934     movl(dst, -1);
1935     jcc(Assembler::parity, L);
1936     jcc(Assembler::below , L);
1937     movl(dst, 0);
1938     jcc(Assembler::equal , L);
1939     increment(dst);
1940   } else { // unordered is greater
1941     movl(dst, 1);
1942     jcc(Assembler::parity, L);
1943     jcc(Assembler::above , L);
1944     movl(dst, 0);
1945     jcc(Assembler::equal , L);
1946     decrementl(dst);
1947   }
1948   bind(L);
1949 }
1950 
1951 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1952   ucomiss(opr1, opr2);
1953 
1954   Label L;
1955   if (unordered_is_less) {
1956     movl(dst, -1);
1957     jcc(Assembler::parity, L);
1958     jcc(Assembler::below , L);
1959     movl(dst, 0);
1960     jcc(Assembler::equal , L);
1961     increment(dst);
1962   } else { // unordered is greater
1963     movl(dst, 1);
1964     jcc(Assembler::parity, L);
1965     jcc(Assembler::above , L);
1966     movl(dst, 0);
1967     jcc(Assembler::equal , L);
1968     decrementl(dst);
1969   }
1970   bind(L);
1971 }
1972 
1973 
1974 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
1975   if (reachable(src1)) {
1976     cmpb(as_Address(src1), imm);
1977   } else {
1978     lea(rscratch1, src1);
1979     cmpb(Address(rscratch1, 0), imm);
1980   }
1981 }
1982 
1983 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
1984 #ifdef _LP64
1985   if (src2.is_lval()) {
1986     movptr(rscratch1, src2);
1987     Assembler::cmpq(src1, rscratch1);
1988   } else if (reachable(src2)) {
1989     cmpq(src1, as_Address(src2));
1990   } else {
1991     lea(rscratch1, src2);
1992     Assembler::cmpq(src1, Address(rscratch1, 0));
1993   }
1994 #else
1995   if (src2.is_lval()) {
1996     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1997   } else {
1998     cmpl(src1, as_Address(src2));
1999   }
2000 #endif // _LP64
2001 }
2002 
2003 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2004   assert(src2.is_lval(), "not a mem-mem compare");
2005 #ifdef _LP64
2006   // moves src2's literal address
2007   movptr(rscratch1, src2);
2008   Assembler::cmpq(src1, rscratch1);
2009 #else
2010   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2011 #endif // _LP64
2012 }
2013 
2014 void MacroAssembler::cmpoop(Register src1, Register src2) {
2015   cmpptr(src1, src2);
2016 }
2017 
2018 void MacroAssembler::cmpoop(Register src1, Address src2) {
2019   cmpptr(src1, src2);
2020 }
2021 
2022 #ifdef _LP64
2023 void MacroAssembler::cmpoop(Register src1, jobject src2) {
2024   movoop(rscratch1, src2);
2025   cmpptr(src1, rscratch1);
2026 }
2027 #endif
2028 
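// CMPXCHG compares rax with the destination operand and, if they are equal,
// stores reg there (setting ZF); otherwise the current value is loaded into
// rax. The lock prefix makes the whole read-modify-write atomic.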
2029 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2030   if (reachable(adr)) {
2031     lock();
2032     cmpxchgptr(reg, as_Address(adr));
2033   } else {
2034     lea(rscratch1, adr);
2035     lock();
2036     cmpxchgptr(reg, Address(rscratch1, 0));
2037   }
2038 }
2039 
2040 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2041   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2042 }
2043 
2044 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2045   if (reachable(src)) {
2046     Assembler::comisd(dst, as_Address(src));
2047   } else {
2048     lea(rscratch1, src);
2049     Assembler::comisd(dst, Address(rscratch1, 0));
2050   }
2051 }
2052 
2053 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2054   if (reachable(src)) {
2055     Assembler::comiss(dst, as_Address(src));
2056   } else {
2057     lea(rscratch1, src);
2058     Assembler::comiss(dst, Address(rscratch1, 0));
2059   }
2060 }
2061 
2062 
2063 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2064   Condition negated_cond = negate_condition(cond);
2065   Label L;
2066   jcc(negated_cond, L);
2067   pushf(); // Preserve flags
2068   atomic_incl(counter_addr);
2069   popf();
2070   bind(L);
2071 }
2072 
2073 int MacroAssembler::corrected_idivl(Register reg) {
2074   // Full implementation of Java idiv and irem; checks for
2075   // special case as described in JVM spec., p.243 & p.271.
2076   // The function returns the (pc) offset of the idivl
2077   // instruction - may be needed for implicit exceptions.
2078   //
2079   //         normal case                           special case
2080   //
2081   // input : rax: dividend                          min_int
2082   //         reg: divisor   (may not be rax/rdx)    -1
2083   //
2084   // output: rax: quotient  (= rax idiv reg)        min_int
2085   //         rdx: remainder (= rax irem reg)        0
2086   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
2087   const int min_int = 0x80000000;
2088   Label normal_case, special_case;
2089 
2090   // check for special case
2091   cmpl(rax, min_int);
2092   jcc(Assembler::notEqual, normal_case);
2093   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2094   cmpl(reg, -1);
2095   jcc(Assembler::equal, special_case);
2096 
2097   // handle normal case
2098   bind(normal_case);
2099   cdql();
2100   int idivl_offset = offset();
2101   idivl(reg);
2102 
2103   // normal and special case exit
2104   bind(special_case);
2105 
2106   return idivl_offset;
2107 }
2108 
2109 
2110 
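// value == min_jint is handled separately because negating it overflows, so
// it cannot be routed through the incrementl(reg, -value) path; a plain subl
// with the immediate is correct in that case.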
2111 void MacroAssembler::decrementl(Register reg, int value) {
2112   if (value == min_jint) {subl(reg, value) ; return; }
2113   if (value <  0) { incrementl(reg, -value); return; }
2114   if (value == 0) {                        ; return; }
2115   if (value == 1 && UseIncDec) { decl(reg) ; return; }
2116   /* else */      { subl(reg, value)       ; return; }
2117 }
2118 
2119 void MacroAssembler::decrementl(Address dst, int value) {
2120   if (value == min_jint) {subl(dst, value) ; return; }
2121   if (value <  0) { incrementl(dst, -value); return; }
2122   if (value == 0) {                        ; return; }
2123   if (value == 1 && UseIncDec) { decl(dst) ; return; }
2124   /* else */      { subl(dst, value)       ; return; }
2125 }
2126 
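// Signed division by a power of two (reg /= 2^shift_value) with rounding
// toward zero: for a negative dividend, (2^shift_value - 1) is added first so
// that the arithmetic right shift rounds toward zero rather than toward
// negative infinity.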
2127 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2128   assert (shift_value > 0, "illegal shift value");
2129   Label _is_positive;
2130   testl (reg, reg);
2131   jcc (Assembler::positive, _is_positive);
2132   int offset = (1 << shift_value) - 1 ;
2133 
2134   if (offset == 1) {
2135     incrementl(reg);
2136   } else {
2137     addl(reg, offset);
2138   }
2139 
2140   bind (_is_positive);
2141   sarl(reg, shift_value);
2142 }
2143 
2144 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2145   if (reachable(src)) {
2146     Assembler::divsd(dst, as_Address(src));
2147   } else {
2148     lea(rscratch1, src);
2149     Assembler::divsd(dst, Address(rscratch1, 0));
2150   }
2151 }
2152 
2153 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2154   if (reachable(src)) {
2155     Assembler::divss(dst, as_Address(src));
2156   } else {
2157     lea(rscratch1, src);
2158     Assembler::divss(dst, Address(rscratch1, 0));
2159   }
2160 }
2161 
2162 void MacroAssembler::enter() {
2163   push(rbp);
2164   mov(rbp, rsp);
2165 }
2166 
2167 // A 5 byte nop that is safe for patching (see patch_verified_entry)
2168 void MacroAssembler::fat_nop() {
2169   if (UseAddressNop) {
2170     addr_nop_5();
2171   } else {
2172     emit_int8(0x26); // es:
2173     emit_int8(0x2e); // cs:
2174     emit_int8(0x64); // fs:
2175     emit_int8(0x65); // gs:
2176     emit_int8((unsigned char)0x90);
2177   }
2178 }
2179 
2180 #ifndef _LP64
2181 void MacroAssembler::fcmp(Register tmp) {
2182   fcmp(tmp, 1, true, true);
2183 }
2184 
2185 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2186   assert(!pop_right || pop_left, "usage error");
2187   if (VM_Version::supports_cmov()) {
2188     assert(tmp == noreg, "unneeded temp");
2189     if (pop_left) {
2190       fucomip(index);
2191     } else {
2192       fucomi(index);
2193     }
2194     if (pop_right) {
2195       fpop();
2196     }
2197   } else {
2198     assert(tmp != noreg, "need temp");
2199     if (pop_left) {
2200       if (pop_right) {
2201         fcompp();
2202       } else {
2203         fcomp(index);
2204       }
2205     } else {
2206       fcom(index);
2207     }
2208     // convert FPU condition into eflags condition via rax
2209     save_rax(tmp);
2210     fwait(); fnstsw_ax();
2211     sahf();
2212     restore_rax(tmp);
2213   }
2214   // condition codes set as follows:
2215   //
2216   // CF (corresponds to C0) if x < y
2217   // PF (corresponds to C2) if unordered
2218   // ZF (corresponds to C3) if x = y
2219 }
2220 
2221 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2222   fcmp2int(dst, unordered_is_less, 1, true, true);
2223 }
2224 
2225 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2226   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2227   Label L;
2228   if (unordered_is_less) {
2229     movl(dst, -1);
2230     jcc(Assembler::parity, L);
2231     jcc(Assembler::below , L);
2232     movl(dst, 0);
2233     jcc(Assembler::equal , L);
2234     increment(dst);
2235   } else { // unordered is greater
2236     movl(dst, 1);
2237     jcc(Assembler::parity, L);
2238     jcc(Assembler::above , L);
2239     movl(dst, 0);
2240     jcc(Assembler::equal , L);
2241     decrementl(dst);
2242   }
2243   bind(L);
2244 }
2245 
2246 void MacroAssembler::fld_d(AddressLiteral src) {
2247   fld_d(as_Address(src));
2248 }
2249 
2250 void MacroAssembler::fld_s(AddressLiteral src) {
2251   fld_s(as_Address(src));
2252 }
2253 
2254 void MacroAssembler::fldcw(AddressLiteral src) {
2255   Assembler::fldcw(as_Address(src));
2256 }
2257 
2258 void MacroAssembler::fpop() {
2259   ffree();
2260   fincstp();
2261 }
2262 
2263 void MacroAssembler::fremr(Register tmp) {
2264   save_rax(tmp);
2265   { Label L;
2266     bind(L);
2267     fprem();
2268     fwait(); fnstsw_ax();
2269     sahf();
2270     jcc(Assembler::parity, L);
2271   }
2272   restore_rax(tmp);
2273   // Result is in ST0.
2274   // Note: fxch & fpop to get rid of ST1
2275   // (otherwise FPU stack could overflow eventually)
2276   fxch(1);
2277   fpop();
2278 }
2279 
2280 void MacroAssembler::empty_FPU_stack() {
2281   if (VM_Version::supports_mmx()) {
2282     emms();
2283   } else {
2284     for (int i = 8; i-- > 0; ) ffree(i);
2285   }
2286 }
2287 #endif // !LP64
2288 
2289 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2290   if (reachable(src)) {
2291     Assembler::mulpd(dst, as_Address(src));
2292   } else {
2293     lea(rscratch1, src);
2294     Assembler::mulpd(dst, Address(rscratch1, 0));
2295   }
2296 }
2297 
2298 void MacroAssembler::load_float(Address src) {
2299 #ifdef _LP64
2300   movflt(xmm0, src);
2301 #else
2302   if (UseSSE >= 1) {
2303     movflt(xmm0, src);
2304   } else {
2305     fld_s(src);
2306   }
2307 #endif // LP64
2308 }
2309 
2310 void MacroAssembler::store_float(Address dst) {
2311 #ifdef _LP64
2312   movflt(dst, xmm0);
2313 #else
2314   if (UseSSE >= 1) {
2315     movflt(dst, xmm0);
2316   } else {
2317     fstp_s(dst);
2318   }
2319 #endif // LP64
2320 }
2321 
2322 void MacroAssembler::load_double(Address src) {
2323 #ifdef _LP64
2324   movdbl(xmm0, src);
2325 #else
2326   if (UseSSE >= 2) {
2327     movdbl(xmm0, src);
2328   } else {
2329     fld_d(src);
2330   }
2331 #endif // LP64
2332 }
2333 
2334 void MacroAssembler::store_double(Address dst) {
2335 #ifdef _LP64
2336   movdbl(dst, xmm0);
2337 #else
2338   if (UseSSE >= 2) {
2339     movdbl(dst, xmm0);
2340   } else {
2341     fstp_d(dst);
2342   }
2343 #endif // LP64
2344 }
2345 
2346 // dst = c = a * b + c
2347 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2348   Assembler::vfmadd231sd(c, a, b);
2349   if (dst != c) {
2350     movdbl(dst, c);
2351   }
2352 }
2353 
2354 // dst = c = a * b + c
2355 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2356   Assembler::vfmadd231ss(c, a, b);
2357   if (dst != c) {
2358     movflt(dst, c);
2359   }
2360 }
2361 
2362 // dst = c = a * b + c
2363 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2364   Assembler::vfmadd231pd(c, a, b, vector_len);
2365   if (dst != c) {
2366     vmovdqu(dst, c);
2367   }
2368 }
2369 
2370 // dst = c = a * b + c
2371 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2372   Assembler::vfmadd231ps(c, a, b, vector_len);
2373   if (dst != c) {
2374     vmovdqu(dst, c);
2375   }
2376 }
2377 
2378 // dst = c = a * b + c
2379 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2380   Assembler::vfmadd231pd(c, a, b, vector_len);
2381   if (dst != c) {
2382     vmovdqu(dst, c);
2383   }
2384 }
2385 
2386 // dst = c = a * b + c
2387 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2388   Assembler::vfmadd231ps(c, a, b, vector_len);
2389   if (dst != c) {
2390     vmovdqu(dst, c);
2391   }
2392 }
2393 
2394 void MacroAssembler::incrementl(AddressLiteral dst) {
2395   if (reachable(dst)) {
2396     incrementl(as_Address(dst));
2397   } else {
2398     lea(rscratch1, dst);
2399     incrementl(Address(rscratch1, 0));
2400   }
2401 }
2402 
2403 void MacroAssembler::incrementl(ArrayAddress dst) {
2404   incrementl(as_Address(dst));
2405 }
2406 
2407 void MacroAssembler::incrementl(Register reg, int value) {
2408   if (value == min_jint) {addl(reg, value) ; return; }
2409   if (value <  0) { decrementl(reg, -value); return; }
2410   if (value == 0) {                        ; return; }
2411   if (value == 1 && UseIncDec) { incl(reg) ; return; }
2412   /* else */      { addl(reg, value)       ; return; }
2413 }
2414 
2415 void MacroAssembler::incrementl(Address dst, int value) {
2416   if (value == min_jint) {addl(dst, value) ; return; }
2417   if (value <  0) { decrementl(dst, -value); return; }
2418   if (value == 0) {                        ; return; }
2419   if (value == 1 && UseIncDec) { incl(dst) ; return; }
2420   /* else */      { addl(dst, value)       ; return; }
2421 }
2422 
2423 void MacroAssembler::jump(AddressLiteral dst) {
2424   if (reachable(dst)) {
2425     jmp_literal(dst.target(), dst.rspec());
2426   } else {
2427     lea(rscratch1, dst);
2428     jmp(rscratch1);
2429   }
2430 }
2431 
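// Conditional jump to an AddressLiteral. When the target is reachable a rel8
// or rel32 jcc is emitted directly; otherwise the condition is inverted (via
// the reverse[] table), a short branch skips the next two instructions, and
// the jump is done indirectly through rscratch1.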
2432 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
2433   if (reachable(dst)) {
2434     InstructionMark im(this);
2435     relocate(dst.reloc());
2436     const int short_size = 2;
2437     const int long_size = 6;
2438     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
2439     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
2440       // 0111 tttn #8-bit disp
2441       emit_int8(0x70 | cc);
2442       emit_int8((offs - short_size) & 0xFF);
2443     } else {
2444       // 0000 1111 1000 tttn #32-bit disp
2445       emit_int8(0x0F);
2446       emit_int8((unsigned char)(0x80 | cc));
2447       emit_int32(offs - long_size);
2448     }
2449   } else {
2450 #ifdef ASSERT
2451     warning("reversing conditional branch");
2452 #endif /* ASSERT */
2453     Label skip;
2454     jccb(reverse[cc], skip);
2455     lea(rscratch1, dst);
2456     Assembler::jmp(rscratch1);
2457     bind(skip);
2458   }
2459 }
2460 
2461 void MacroAssembler::fld_x(AddressLiteral src) {
2462   Assembler::fld_x(as_Address(src));
2463 }
2464 
2465 void MacroAssembler::ldmxcsr(AddressLiteral src) {
2466   if (reachable(src)) {
2467     Assembler::ldmxcsr(as_Address(src));
2468   } else {
2469     lea(rscratch1, src);
2470     Assembler::ldmxcsr(Address(rscratch1, 0));
2471   }
2472 }
2473 
2474 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2475   int off;
2476   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2477     off = offset();
2478     movsbl(dst, src); // movsxb
2479   } else {
2480     off = load_unsigned_byte(dst, src);
2481     shll(dst, 24);
2482     sarl(dst, 24);
2483   }
2484   return off;
2485 }
2486 
2487 // Note: load_signed_short used to be called load_signed_word.
2488 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
2489 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
2490 // The term "word" in HotSpot means a 32- or 64-bit machine word.
2491 int MacroAssembler::load_signed_short(Register dst, Address src) {
2492   int off;
2493   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2494     // This is dubious to me since it seems safe to do a signed 16 => 64 bit
2495     // version, but this is what 64 bit has always done. This seems to imply
2496     // that users are only using 32 bits worth.
2497     off = offset();
2498     movswl(dst, src); // movsxw
2499   } else {
2500     off = load_unsigned_short(dst, src);
2501     shll(dst, 16);
2502     sarl(dst, 16);
2503   }
2504   return off;
2505 }
2506 
2507 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2508   // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
2509   // and "3.9 Partial Register Penalties", p. 22.
2510   int off;
2511   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
2512     off = offset();
2513     movzbl(dst, src); // movzxb
2514   } else {
2515     xorl(dst, dst);
2516     off = offset();
2517     movb(dst, src);
2518   }
2519   return off;
2520 }
2521 
2522 // Note: load_unsigned_short used to be called load_unsigned_word.
2523 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2524   // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
2525   // and "3.9 Partial Register Penalties", p. 22.
2526   int off;
2527   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
2528     off = offset();
2529     movzwl(dst, src); // movzxw
2530   } else {
2531     xorl(dst, dst);
2532     off = offset();
2533     movw(dst, src);
2534   }
2535   return off;
2536 }
2537 
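// On 32 bit an 8-byte value is loaded as two 32-bit halves: dst receives the
// low word and dst2 the high word at src + BytesPerInt (little endian).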
2538 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
2539   switch (size_in_bytes) {
2540 #ifndef _LP64
2541   case  8:
2542     assert(dst2 != noreg, "second dest register required");
2543     movl(dst,  src);
2544     movl(dst2, src.plus_disp(BytesPerInt));
2545     break;
2546 #else
2547   case  8:  movq(dst, src); break;
2548 #endif
2549   case  4:  movl(dst, src); break;
2550   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2551   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2552   default:  ShouldNotReachHere();
2553   }
2554 }
2555 
2556 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2557   switch (size_in_bytes) {
2558 #ifndef _LP64
2559   case  8:
2560     assert(src2 != noreg, "second source register required");
2561     movl(dst,                        src);
2562     movl(dst.plus_disp(BytesPerInt), src2);
2563     break;
2564 #else
2565   case  8:  movq(dst, src); break;
2566 #endif
2567   case  4:  movl(dst, src); break;
2568   case  2:  movw(dst, src); break;
2569   case  1:  movb(dst, src); break;
2570   default:  ShouldNotReachHere();
2571   }
2572 }
2573 
2574 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
2575   if (reachable(dst)) {
2576     movl(as_Address(dst), src);
2577   } else {
2578     lea(rscratch1, dst);
2579     movl(Address(rscratch1, 0), src);
2580   }
2581 }
2582 
2583 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2584   if (reachable(src)) {
2585     movl(dst, as_Address(src));
2586   } else {
2587     lea(rscratch1, src);
2588     movl(dst, Address(rscratch1, 0));
2589   }
2590 }
2591 
2592 // C++ bool manipulation
2593 
2594 void MacroAssembler::movbool(Register dst, Address src) {
2595   if(sizeof(bool) == 1)
2596     movb(dst, src);
2597   else if(sizeof(bool) == 2)
2598     movw(dst, src);
2599   else if(sizeof(bool) == 4)
2600     movl(dst, src);
2601   else
2602     // unsupported
2603     ShouldNotReachHere();
2604 }
2605 
2606 void MacroAssembler::movbool(Address dst, bool boolconst) {
2607   if(sizeof(bool) == 1)
2608     movb(dst, (int) boolconst);
2609   else if(sizeof(bool) == 2)
2610     movw(dst, (int) boolconst);
2611   else if(sizeof(bool) == 4)
2612     movl(dst, (int) boolconst);
2613   else
2614     // unsupported
2615     ShouldNotReachHere();
2616 }
2617 
2618 void MacroAssembler::movbool(Address dst, Register src) {
2619   if(sizeof(bool) == 1)
2620     movb(dst, src);
2621   else if(sizeof(bool) == 2)
2622     movw(dst, src);
2623   else if(sizeof(bool) == 4)
2624     movl(dst, src);
2625   else
2626     // unsupported
2627     ShouldNotReachHere();
2628 }
2629 
2630 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
2631   movb(as_Address(dst), src);
2632 }
2633 
2634 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
2635   if (reachable(src)) {
2636     movdl(dst, as_Address(src));
2637   } else {
2638     lea(rscratch1, src);
2639     movdl(dst, Address(rscratch1, 0));
2640   }
2641 }
2642 
2643 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
2644   if (reachable(src)) {
2645     movq(dst, as_Address(src));
2646   } else {
2647     lea(rscratch1, src);
2648     movq(dst, Address(rscratch1, 0));
2649   }
2650 }
2651 
2652 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
2653   if (reachable(src)) {
2654     if (UseXmmLoadAndClearUpper) {
2655       movsd (dst, as_Address(src));
2656     } else {
2657       movlpd(dst, as_Address(src));
2658     }
2659   } else {
2660     lea(rscratch1, src);
2661     if (UseXmmLoadAndClearUpper) {
2662       movsd (dst, Address(rscratch1, 0));
2663     } else {
2664       movlpd(dst, Address(rscratch1, 0));
2665     }
2666   }
2667 }
2668 
2669 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
2670   if (reachable(src)) {
2671     movss(dst, as_Address(src));
2672   } else {
2673     lea(rscratch1, src);
2674     movss(dst, Address(rscratch1, 0));
2675   }
2676 }
2677 
2678 void MacroAssembler::movptr(Register dst, Register src) {
2679   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2680 }
2681 
2682 void MacroAssembler::movptr(Register dst, Address src) {
2683   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2684 }
2685 
2686 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
2687 void MacroAssembler::movptr(Register dst, intptr_t src) {
2688   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
2689 }
2690 
2691 void MacroAssembler::movptr(Address dst, Register src) {
2692   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2693 }
2694 
2695 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2696     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2697     Assembler::movdqu(dst, src);
2698 }
2699 
2700 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2701     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2702     Assembler::movdqu(dst, src);
2703 }
2704 
2705 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2706     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2707     Assembler::movdqu(dst, src);
2708 }
2709 
2710 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
2711   if (reachable(src)) {
2712     movdqu(dst, as_Address(src));
2713   } else {
2714     lea(scratchReg, src);
2715     movdqu(dst, Address(scratchReg, 0));
2716   }
2717 }
2718 
2719 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2720     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2721     Assembler::vmovdqu(dst, src);
2722 }
2723 
2724 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2725     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2726     Assembler::vmovdqu(dst, src);
2727 }
2728 
2729 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2730     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2731     Assembler::vmovdqu(dst, src);
2732 }
2733 
2734 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
2735   if (reachable(src)) {
2736     vmovdqu(dst, as_Address(src));
2737   } else {
2739     lea(scratch_reg, src);
2740     vmovdqu(dst, Address(scratch_reg, 0));
2741   }
2742 }
2743 
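// Opmask moves: AVX512BW widens the k registers to 64 bits and provides
// kmovq, so prefer it when available; otherwise fall back to the 16-bit
// kmovw form that base EVEX (AVX512F) guarantees.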
2744 void MacroAssembler::kmov(KRegister dst, Address src) {
2745   if (VM_Version::supports_avx512bw()) {
2746     kmovql(dst, src);
2747   } else {
2748     assert(VM_Version::supports_evex(), "");
2749     kmovwl(dst, src);
2750   }
2751 }
2752 
2753 void MacroAssembler::kmov(Address dst, KRegister src) {
2754   if (VM_Version::supports_avx512bw()) {
2755     kmovql(dst, src);
2756   } else {
2757     assert(VM_Version::supports_evex(), "");
2758     kmovwl(dst, src);
2759   }
2760 }
2761 
2762 void MacroAssembler::kmov(KRegister dst, KRegister src) {
2763   if (VM_Version::supports_avx512bw()) {
2764     kmovql(dst, src);
2765   } else {
2766     assert(VM_Version::supports_evex(), "");
2767     kmovwl(dst, src);
2768   }
2769 }
2770 
2771 void MacroAssembler::kmov(Register dst, KRegister src) {
2772   if (VM_Version::supports_avx512bw()) {
2773     kmovql(dst, src);
2774   } else {
2775     assert(VM_Version::supports_evex(), "");
2776     kmovwl(dst, src);
2777   }
2778 }
2779 
2780 void MacroAssembler::kmov(KRegister dst, Register src) {
2781   if (VM_Version::supports_avx512bw()) {
2782     kmovql(dst, src);
2783   } else {
2784     assert(VM_Version::supports_evex(), "");
2785     kmovwl(dst, src);
2786   }
2787 }
2788 
2789 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register scratch_reg) {
2790   if (reachable(src)) {
2791     kmovql(dst, as_Address(src));
2792   } else {
2793     lea(scratch_reg, src);
2794     kmovql(dst, Address(scratch_reg, 0));
2795   }
2796 }
2797 
2798 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) {
2799   if (reachable(src)) {
2800     kmovwl(dst, as_Address(src));
2801   } else {
2802     lea(scratch_reg, src);
2803     kmovwl(dst, Address(scratch_reg, 0));
2804   }
2805 }
2806 
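// k0 cannot be encoded as a write mask (it means "no masking" in EVEX), so
// when the caller passes k0 the unmasked form of the move is emitted.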
2807 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2808                                int vector_len, Register scratch_reg) {
2809   if (reachable(src)) {
2810     if (mask == k0) {
2811       Assembler::evmovdqub(dst, as_Address(src), merge, vector_len);
2812     } else {
2813       Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2814     }
2815   } else {
2816     lea(scratch_reg, src);
2817     if (mask == k0) {
2818       Assembler::evmovdqub(dst, Address(scratch_reg, 0), merge, vector_len);
2819     } else {
2820       Assembler::evmovdqub(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2821     }
2822   }
2823 }
2824 
2825 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2826                                int vector_len, Register scratch_reg) {
2827   if (reachable(src)) {
2828     Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2829   } else {
2830     lea(scratch_reg, src);
2831     Assembler::evmovdquw(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2832   }
2833 }
2834 
2835 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2836                                int vector_len, Register scratch_reg) {
2837   if (reachable(src)) {
2838     Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2839   } else {
2840     lea(scratch_reg, src);
2841     Assembler::evmovdqul(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2842   }
2843 }
2844 
2845 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2846                                int vector_len, Register scratch_reg) {
2847   if (reachable(src)) {
2848     Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2849   } else {
2850     lea(scratch_reg, src);
2851     Assembler::evmovdquq(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2852   }
2853 }
2854 
2855 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2856   if (reachable(src)) {
2857     Assembler::evmovdquq(dst, as_Address(src), vector_len);
2858   } else {
2859     lea(rscratch, src);
2860     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2861   }
2862 }
2863 
2864 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
2865   if (reachable(src)) {
2866     Assembler::movdqa(dst, as_Address(src));
2867   } else {
2868     lea(rscratch1, src);
2869     Assembler::movdqa(dst, Address(rscratch1, 0));
2870   }
2871 }
2872 
2873 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
2874   if (reachable(src)) {
2875     Assembler::movsd(dst, as_Address(src));
2876   } else {
2877     lea(rscratch1, src);
2878     Assembler::movsd(dst, Address(rscratch1, 0));
2879   }
2880 }
2881 
2882 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
2883   if (reachable(src)) {
2884     Assembler::movss(dst, as_Address(src));
2885   } else {
2886     lea(rscratch1, src);
2887     Assembler::movss(dst, Address(rscratch1, 0));
2888   }
2889 }
2890 
2891 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
2892   if (reachable(src)) {
2893     Assembler::mulsd(dst, as_Address(src));
2894   } else {
2895     lea(rscratch1, src);
2896     Assembler::mulsd(dst, Address(rscratch1, 0));
2897   }
2898 }
2899 
2900 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
2901   if (reachable(src)) {
2902     Assembler::mulss(dst, as_Address(src));
2903   } else {
2904     lea(rscratch1, src);
2905     Assembler::mulss(dst, Address(rscratch1, 0));
2906   }
2907 }
2908 
2909 void MacroAssembler::null_check(Register reg, int offset) {
2910   if (needs_explicit_null_check(offset)) {
2911     // provoke OS NULL exception if reg = NULL by
2912     // accessing M[reg] w/o changing any (non-CC) registers
2913     // NOTE: cmpl is plenty here to provoke a segv
2914     cmpptr(rax, Address(reg, 0));
2915     // Note: should probably use testl(rax, Address(reg, 0));
2916     //       may be shorter code (however, this version of
2917     //       testl needs to be implemented first)
2918   } else {
2919     // nothing to do, (later) access of M[reg + offset]
2920     // will provoke OS NULL exception if reg = NULL
2921   }
2922 }
2923 
2924 void MacroAssembler::os_breakpoint() {
2925   // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
2926   // (e.g., MSVC can't call ps() otherwise)
2927   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2928 }
2929 
2930 void MacroAssembler::unimplemented(const char* what) {
2931   const char* buf = NULL;
2932   {
2933     ResourceMark rm;
2934     stringStream ss;
2935     ss.print("unimplemented: %s", what);
2936     buf = code_string(ss.as_string());
2937   }
2938   stop(buf);
2939 }
2940 
2941 #ifdef _LP64
2942 #define XSTATE_BV 0x200
2943 #endif
2944 
2945 void MacroAssembler::pop_CPU_state() {
2946   pop_FPU_state();
2947   pop_IU_state();
2948 }
2949 
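// FPU state is saved/restored with the legacy fnsave/frstor pair on 32 bit
// and with fxsave/fxrstor on 64 bit, which additionally covers the XMM
// registers and MXCSR.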
2950 void MacroAssembler::pop_FPU_state() {
2951 #ifndef _LP64
2952   frstor(Address(rsp, 0));
2953 #else
2954   fxrstor(Address(rsp, 0));
2955 #endif
2956   addptr(rsp, FPUStateSizeInWords * wordSize);
2957 }
2958 
2959 void MacroAssembler::pop_IU_state() {
2960   popa();
2961   LP64_ONLY(addq(rsp, 8));
2962   popf();
2963 }
2964 
2965 // Save Integer and Float state
2966 // Warning: Stack must be 16 byte aligned (64bit)
2967 void MacroAssembler::push_CPU_state() {
2968   push_IU_state();
2969   push_FPU_state();
2970 }
2971 
2972 void MacroAssembler::push_FPU_state() {
2973   subptr(rsp, FPUStateSizeInWords * wordSize);
2974 #ifndef _LP64
2975   fnsave(Address(rsp, 0));
2976   fwait();
2977 #else
2978   fxsave(Address(rsp, 0));
2979 #endif // LP64
2980 }
2981 
2982 void MacroAssembler::push_IU_state() {
2983   // Push flags first because pusha kills them
2984   pushf();
2985   // Make sure rsp stays 16-byte aligned
2986   LP64_ONLY(subq(rsp, 8));
2987   pusha();
2988 }
2989 
2990 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) {
  // determine java_thread register
2991   if (!java_thread->is_valid()) {
2992     java_thread = rdi;
2993     get_thread(java_thread);
2994   }
2995   // we must set sp to zero to clear frame
2996   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
2997   // must clear fp, so that compiled frames are not confused; it is
2998   // possible that we need it only for debugging
2999   if (clear_fp) {
3000     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3001   }
3002   // Always clear the pc because it could have been set by make_walkable()
3003   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3004   vzeroupper();
3005 }
3006 
3007 void MacroAssembler::restore_rax(Register tmp) {
3008   if (tmp == noreg) pop(rax);
3009   else if (tmp != rax) mov(rax, tmp);
3010 }
3011 
3012 void MacroAssembler::round_to(Register reg, int modulus) {
3013   addptr(reg, modulus - 1);
3014   andptr(reg, -modulus);
3015 }
3016 
3017 void MacroAssembler::save_rax(Register tmp) {
3018   if (tmp == noreg) push(rax);
3019   else if (tmp != rax) mov(tmp, rax);
3020 }
3021 
3022 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
3023   if (at_return) {
3024     // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
3025     // we may safely use rsp instead to perform the stack watermark check.
3026     cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset()));
3027     jcc(Assembler::above, slow_path);
3028     return;
3029   }
3030   testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
3031   jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
3032 }
3033 
3034 // Calls to C land
3035 //
3036 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded
3037 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3038 // has to be reset to 0. This is required to allow proper stack traversal.
3039 void MacroAssembler::set_last_Java_frame(Register java_thread,
3040                                          Register last_java_sp,
3041                                          Register last_java_fp,
3042                                          address  last_java_pc) {
3043   vzeroupper();
3044   // determine java_thread register
3045   if (!java_thread->is_valid()) {
3046     java_thread = rdi;
3047     get_thread(java_thread);
3048   }
3049   // determine last_java_sp register
3050   if (!last_java_sp->is_valid()) {
3051     last_java_sp = rsp;
3052   }
3053 
3054   // last_java_fp is optional
3055 
3056   if (last_java_fp->is_valid()) {
3057     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3058   }
3059 
3060   // last_java_pc is optional
3061 
3062   if (last_java_pc != NULL) {
3063     lea(Address(java_thread,
3064                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3065         InternalAddress(last_java_pc));
3066 
3067   }
3068   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3069 }
3070 
3071 void MacroAssembler::shlptr(Register dst, int imm8) {
3072   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3073 }
3074 
3075 void MacroAssembler::shrptr(Register dst, int imm8) {
3076   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3077 }
3078 
3079 void MacroAssembler::sign_extend_byte(Register reg) {
3080   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3081     movsbl(reg, reg); // movsxb
3082   } else {
3083     shll(reg, 24);
3084     sarl(reg, 24);
3085   }
3086 }
3087 
3088 void MacroAssembler::sign_extend_short(Register reg) {
3089   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3090     movswl(reg, reg); // movsxw
3091   } else {
3092     shll(reg, 16);
3093     sarl(reg, 16);
3094   }
3095 }
3096 
3097 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3098   assert(reachable(src), "Address should be reachable");
3099   testl(dst, as_Address(src));
3100 }
3101 
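// The encoding asserts below keep legacy (non-EVEX) SSE forms from being used
// with xmm16-31: those registers are only addressable with an EVEX encoding,
// which for these instructions requires AVX512VL and AVX512BW.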
3102 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3103   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3104   Assembler::pcmpeqb(dst, src);
3105 }
3106 
3107 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3108   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3109   Assembler::pcmpeqw(dst, src);
3110 }
3111 
3112 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3113   assert((dst->encoding() < 16),"XMM register should be 0-15");
3114   Assembler::pcmpestri(dst, src, imm8);
3115 }
3116 
3117 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3118   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3119   Assembler::pcmpestri(dst, src, imm8);
3120 }
3121 
3122 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3123   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3124   Assembler::pmovzxbw(dst, src);
3125 }
3126 
3127 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3128   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3129   Assembler::pmovzxbw(dst, src);
3130 }
3131 
3132 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3133   assert((src->encoding() < 16),"XMM register should be 0-15");
3134   Assembler::pmovmskb(dst, src);
3135 }
3136 
3137 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3138   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3139   Assembler::ptest(dst, src);
3140 }
3141 
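// Most AddressLiteral overloads that follow share one pattern: if the literal
// is directly reachable from the code (always true on 32-bit builds, and within
// rip-relative range on 64-bit) it is used as a memory operand; otherwise the
// address is first materialized with lea into a scratch register (rscratch1,
// or an explicit scratch_reg argument). Sketch:
//
//   if (reachable(src)) {
//     Assembler::op(dst, as_Address(src));
//   } else {
//     lea(rscratch1, src);
//     Assembler::op(dst, Address(rscratch1, 0));
//   }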
3142 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3143   if (reachable(src)) {
3144     Assembler::sqrtsd(dst, as_Address(src));
3145   } else {
3146     lea(rscratch1, src);
3147     Assembler::sqrtsd(dst, Address(rscratch1, 0));
3148   }
3149 }
3150 
3151 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3152   if (reachable(src)) {
3153     Assembler::sqrtss(dst, as_Address(src));
3154   } else {
3155     lea(rscratch1, src);
3156     Assembler::sqrtss(dst, Address(rscratch1, 0));
3157   }
3158 }
3159 
3160 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3161   if (reachable(src)) {
3162     Assembler::subsd(dst, as_Address(src));
3163   } else {
3164     lea(rscratch1, src);
3165     Assembler::subsd(dst, Address(rscratch1, 0));
3166   }
3167 }
3168 
3169 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) {
3170   if (reachable(src)) {
3171     Assembler::roundsd(dst, as_Address(src), rmode);
3172   } else {
3173     lea(scratch_reg, src);
3174     Assembler::roundsd(dst, Address(scratch_reg, 0), rmode);
3175   }
3176 }
3177 
3178 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3179   if (reachable(src)) {
3180     Assembler::subss(dst, as_Address(src));
3181   } else {
3182     lea(rscratch1, src);
3183     Assembler::subss(dst, Address(rscratch1, 0));
3184   }
3185 }
3186 
3187 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3188   if (reachable(src)) {
3189     Assembler::ucomisd(dst, as_Address(src));
3190   } else {
3191     lea(rscratch1, src);
3192     Assembler::ucomisd(dst, Address(rscratch1, 0));
3193   }
3194 }
3195 
3196 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3197   if (reachable(src)) {
3198     Assembler::ucomiss(dst, as_Address(src));
3199   } else {
3200     lea(rscratch1, src);
3201     Assembler::ucomiss(dst, Address(rscratch1, 0));
3202   }
3203 }
3204 
3205 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3206   // Used in sign-bit flipping with aligned address.
3207   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3208   if (reachable(src)) {
3209     Assembler::xorpd(dst, as_Address(src));
3210   } else {
3211     lea(scratch_reg, src);
3212     Assembler::xorpd(dst, Address(scratch_reg, 0));
3213   }
3214 }
3215 
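// Note for the two register-register forms below: xorpd/xorps lack an EVEX
// encoding unless AVX512DQ is present, so on such AVX-512 targets the self-xor
// case (clearing a register) is rewritten as a 512-bit integer vpxor, which
// only needs plain AVX512F.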
3216 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3217   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3218     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3219   } else {
3221     Assembler::xorpd(dst, src);
3222   }
3223 }
3224 
3225 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3226   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3227     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3228   } else {
3229     Assembler::xorps(dst, src);
3230   }
3231 }
3232 
3233 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3234   // Used in sign-bit flipping with aligned address.
3235   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3236   if (reachable(src)) {
3237     Assembler::xorps(dst, as_Address(src));
3238   } else {
3239     lea(scratch_reg, src);
3240     Assembler::xorps(dst, Address(scratch_reg, 0));
3241   }
3242 }
3243 
3244 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3245   // The 128-bit memory operand must be 16-byte aligned in SSE (non-AVX) mode.
3246   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3247   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3248   if (reachable(src)) {
3249     Assembler::pshufb(dst, as_Address(src));
3250   } else {
3251     lea(rscratch1, src);
3252     Assembler::pshufb(dst, Address(rscratch1, 0));
3253   }
3254 }
3255 
3256 // AVX 3-operands instructions
3257 
3258 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3259   if (reachable(src)) {
3260     vaddsd(dst, nds, as_Address(src));
3261   } else {
3262     lea(rscratch1, src);
3263     vaddsd(dst, nds, Address(rscratch1, 0));
3264   }
3265 }
3266 
3267 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3268   if (reachable(src)) {
3269     vaddss(dst, nds, as_Address(src));
3270   } else {
3271     lea(rscratch1, src);
3272     vaddss(dst, nds, Address(rscratch1, 0));
3273   }
3274 }
3275 
3276 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3277   assert(UseAVX > 0, "requires some form of AVX");
3278   if (reachable(src)) {
3279     Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
3280   } else {
3281     lea(rscratch, src);
3282     Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
3283   }
3284 }
3285 
3286 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3287   assert(UseAVX > 0, "requires some form of AVX");
3288   if (reachable(src)) {
3289     Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3290   } else {
3291     lea(rscratch, src);
3292     Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3293   }
3294 }
3295 
3296 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3297   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3298   vandps(dst, nds, negate_field, vector_len);
3299 }
3300 
3301 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3302   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3303   vandpd(dst, nds, negate_field, vector_len);
3304 }
3305 
3306 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3307   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3308   Assembler::vpaddb(dst, nds, src, vector_len);
3309 }
3310 
3311 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3312   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3313   Assembler::vpaddb(dst, nds, src, vector_len);
3314 }
3315 
3316 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3317   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3318   Assembler::vpaddw(dst, nds, src, vector_len);
3319 }
3320 
3321 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3322   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3323   Assembler::vpaddw(dst, nds, src, vector_len);
3324 }
3325 
3326 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3327   if (reachable(src)) {
3328     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3329   } else {
3330     lea(scratch_reg, src);
3331     Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3332   }
3333 }
3334 
3335 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3336   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3337   Assembler::vpbroadcastw(dst, src, vector_len);
3338 }
3339 
3340 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3341   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3342   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3343 }
3344 
3345 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3346   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3347   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3348 }
3349 
3350 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds,
3351                                AddressLiteral src, int vector_len, Register scratch_reg) {
3352   if (reachable(src)) {
3353     Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3354   } else {
3355     lea(scratch_reg, src);
3356     Assembler::evpcmpeqd(kdst, mask, nds, Address(scratch_reg, 0), vector_len);
3357   }
3358 }
3359 
3360 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3361                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3362   if (reachable(src)) {
3363     Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3364   } else {
3365     lea(scratch_reg, src);
3366     Assembler::evpcmpd(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3367   }
3368 }
3369 
3370 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3371                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3372   if (reachable(src)) {
3373     Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3374   } else {
3375     lea(scratch_reg, src);
3376     Assembler::evpcmpq(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3377   }
3378 }
3379 
3380 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3381                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3382   if (reachable(src)) {
3383     Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3384   } else {
3385     lea(scratch_reg, src);
3386     Assembler::evpcmpb(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3387   }
3388 }
3389 
3390 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3391                              int comparison, bool is_signed, int vector_len, Register scratch_reg) {
3392   if (reachable(src)) {
3393     Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3394   } else {
3395     lea(scratch_reg, src);
3396     Assembler::evpcmpw(kdst, mask, nds, Address(scratch_reg, 0), comparison, is_signed, vector_len);
3397   }
3398 }
3399 
3400 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3401   if (width == Assembler::Q) {
3402     Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3403   } else {
3404     Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3405   }
3406 }
3407 
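// vpcmpCCW synthesizes an arbitrary packed-compare predicate from the two
// compares the ISA provides (equal and signed greater-than). The cond_encoding
// values are instruction opcodes: 0x29/0x37 select pcmpeqq/pcmpgtq for
// quadwords, while 0x74 + width / 0x64 + width select pcmpeq{b,w,d} /
// pcmpgt{b,w,d} (assuming the Width enumerators B/W/D are 0/1/2). The remaining
// predicates are derived by swapping the operands (lt, nlt) and/or inverting
// the result with an all-ones xor (neq, le, nlt).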
3408 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg) {
3409   int eq_cond_enc = 0x29;
3410   int gt_cond_enc = 0x37;
3411   if (width != Assembler::Q) {
3412     eq_cond_enc = 0x74 + width;
3413     gt_cond_enc = 0x64 + width;
3414   }
3415   switch (cond) {
3416   case eq:
3417     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3418     break;
3419   case neq:
3420     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3421     vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3422     break;
3423   case le:
3424     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3425     vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3426     break;
3427   case nlt:
3428     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3429     vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3430     break;
3431   case lt:
3432     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3433     break;
3434   case nle:
3435     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3436     break;
3437   default:
3438     assert(false, "Should not reach here");
3439   }
3440 }
3441 
3442 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3443   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3444   Assembler::vpmovzxbw(dst, src, vector_len);
3445 }
3446 
3447 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3448   assert((src->encoding() < 16),"XMM register should be 0-15");
3449   Assembler::vpmovmskb(dst, src, vector_len);
3450 }
3451 
3452 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3453   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3454   Assembler::vpmullw(dst, nds, src, vector_len);
3455 }
3456 
3457 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3458   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3459   Assembler::vpmullw(dst, nds, src, vector_len);
3460 }
3461 
3462 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3463   assert((UseAVX > 0), "AVX support is needed");
3464   if (reachable(src)) {
3465     Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3466   } else {
3467     lea(scratch_reg, src);
3468     Assembler::vpmulld(dst, nds, Address(scratch_reg, 0), vector_len);
3469   }
3470 }
3471 
3472 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3473   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3474   Assembler::vpsubb(dst, nds, src, vector_len);
3475 }
3476 
3477 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3478   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3479   Assembler::vpsubb(dst, nds, src, vector_len);
3480 }
3481 
3482 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3483   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3484   Assembler::vpsubw(dst, nds, src, vector_len);
3485 }
3486 
3487 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3488   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3489   Assembler::vpsubw(dst, nds, src, vector_len);
3490 }
3491 
3492 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3493   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3494   Assembler::vpsraw(dst, nds, shift, vector_len);
3495 }
3496 
3497 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3498   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3499   Assembler::vpsraw(dst, nds, shift, vector_len);
3500 }
3501 
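// evpsraq exists only as an EVEX (AVX-512) instruction. Without AVX512VL the
// EVEX form is only available at 512-bit vector length, so narrower requests
// are widened to AVX_512bit here; the extra upper lanes are don't-care.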
3502 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3503   assert(UseAVX > 2, "requires AVX-512");
3504   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3505      vector_len = 2;
3506   }
3507   Assembler::evpsraq(dst, nds, shift, vector_len);
3508 }
3509 
3510 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3511   assert(UseAVX > 2, "requires AVX-512");
3512   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3513      vector_len = 2;
3514   }
3515   Assembler::evpsraq(dst, nds, shift, vector_len);
3516 }
3517 
3518 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3519   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3520   Assembler::vpsrlw(dst, nds, shift, vector_len);
3521 }
3522 
3523 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3524   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3525   Assembler::vpsrlw(dst, nds, shift, vector_len);
3526 }
3527 
3528 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3529   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3530   Assembler::vpsllw(dst, nds, shift, vector_len);
3531 }
3532 
3533 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3534   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3535   Assembler::vpsllw(dst, nds, shift, vector_len);
3536 }
3537 
3538 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3539   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3540   Assembler::vptest(dst, src);
3541 }
3542 
3543 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3544   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3545   Assembler::punpcklbw(dst, src);
3546 }
3547 
3548 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3549   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3550   Assembler::pshufd(dst, src, mode);
3551 }
3552 
3553 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3554   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3555   Assembler::pshuflw(dst, src, mode);
3556 }
3557 
3558 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3559   if (reachable(src)) {
3560     vandpd(dst, nds, as_Address(src), vector_len);
3561   } else {
3562     lea(scratch_reg, src);
3563     vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3564   }
3565 }
3566 
3567 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3568   if (reachable(src)) {
3569     vandps(dst, nds, as_Address(src), vector_len);
3570   } else {
3571     lea(scratch_reg, src);
3572     vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3573   }
3574 }
3575 
3576 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3577                             bool merge, int vector_len, Register scratch_reg) {
3578   if (reachable(src)) {
3579     Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3580   } else {
3581     lea(scratch_reg, src);
3582     Assembler::evpord(dst, mask, nds, Address(scratch_reg, 0), merge, vector_len);
3583   }
3584 }
3585 
3586 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3587   if (reachable(src)) {
3588     vdivsd(dst, nds, as_Address(src));
3589   } else {
3590     lea(rscratch1, src);
3591     vdivsd(dst, nds, Address(rscratch1, 0));
3592   }
3593 }
3594 
3595 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3596   if (reachable(src)) {
3597     vdivss(dst, nds, as_Address(src));
3598   } else {
3599     lea(rscratch1, src);
3600     vdivss(dst, nds, Address(rscratch1, 0));
3601   }
3602 }
3603 
3604 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3605   if (reachable(src)) {
3606     vmulsd(dst, nds, as_Address(src));
3607   } else {
3608     lea(rscratch1, src);
3609     vmulsd(dst, nds, Address(rscratch1, 0));
3610   }
3611 }
3612 
3613 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3614   if (reachable(src)) {
3615     vmulss(dst, nds, as_Address(src));
3616   } else {
3617     lea(rscratch1, src);
3618     vmulss(dst, nds, Address(rscratch1, 0));
3619   }
3620 }
3621 
3622 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3623   if (reachable(src)) {
3624     vsubsd(dst, nds, as_Address(src));
3625   } else {
3626     lea(rscratch1, src);
3627     vsubsd(dst, nds, Address(rscratch1, 0));
3628   }
3629 }
3630 
3631 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3632   if (reachable(src)) {
3633     vsubss(dst, nds, as_Address(src));
3634   } else {
3635     lea(rscratch1, src);
3636     vsubss(dst, nds, Address(rscratch1, 0));
3637   }
3638 }
3639 
3640 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3641   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3642   vxorps(dst, nds, src, Assembler::AVX_128bit);
3643 }
3644 
3645 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3646   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3647   vxorpd(dst, nds, src, Assembler::AVX_128bit);
3648 }
3649 
3650 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3651   if (reachable(src)) {
3652     vxorpd(dst, nds, as_Address(src), vector_len);
3653   } else {
3654     lea(scratch_reg, src);
3655     vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
3656   }
3657 }
3658 
3659 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3660   if (reachable(src)) {
3661     vxorps(dst, nds, as_Address(src), vector_len);
3662   } else {
3663     lea(scratch_reg, src);
3664     vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
3665   }
3666 }
3667 
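// A 256-bit integer vpxor needs AVX2. On AVX-only targets asked for a 256-bit
// xor, the request is routed to vxorpd instead, which computes the same
// bitwise result and is available at 256 bits with plain AVX.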
3668 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3669   if (UseAVX > 1 || (vector_len < 1)) {
3670     if (reachable(src)) {
3671       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3672     } else {
3673       lea(scratch_reg, src);
3674       Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
3675     }
3676   } else {
3678     MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
3679   }
3680 }
3681 
3682 void MacroAssembler::vpermd(XMMRegister dst,  XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3683   if (reachable(src)) {
3684     Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3685   } else {
3686     lea(scratch_reg, src);
3687     Assembler::vpermd(dst, nds, Address(scratch_reg, 0), vector_len);
3688   }
3689 }
3690 
3691 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
3692   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
3693   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
3694   // The inverted mask is sign-extended
3695   andptr(possibly_jweak, inverted_jweak_mask);
3696 }
3697 
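// Resolve a JNI handle (jobject or jweak) to an oop. In pseudocode, where the
// actual loads go through the GC barrier via access_load_at (sketch):
//
//   if (value == NULL)               result = NULL;                       // null handle
//   else if (value & weak_tag_mask)  result = *(value - weak_tag_value);  // jweak (phantom ref)
//   else                             result = *value;                     // strong jobject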
3698 void MacroAssembler::resolve_jobject(Register value,
3699                                      Register thread,
3700                                      Register tmp) {
3701   assert_different_registers(value, thread, tmp);
3702   Label done, not_weak;
3703   testptr(value, value);
3704   jcc(Assembler::zero, done);                // Use NULL as-is.
3705   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
3706   jcc(Assembler::zero, not_weak);
3707   // Resolve jweak.
3708   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3709                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
3710   verify_oop(value);
3711   jmp(done);
3712   bind(not_weak);
3713   // Resolve (untagged) jobject.
3714   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
3715   verify_oop(value);
3716   bind(done);
3717 }
3718 
3719 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3720   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
3721 }
3722 
3723 // Force generation of a 4-byte immediate value even if it fits into 8 bits
3724 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3725   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
3726 }
3727 
3728 void MacroAssembler::subptr(Register dst, Register src) {
3729   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
3730 }
3731 
3732 // C++ bool manipulation
3733 void MacroAssembler::testbool(Register dst) {
3734   if (sizeof(bool) == 1)
3735     testb(dst, 0xff);
3736   else if (sizeof(bool) == 2) {
3737     // testw implementation needed for two byte bools
3738     ShouldNotReachHere();
3739   } else if (sizeof(bool) == 4)
3740     testl(dst, dst);
3741   else
3742     // unsupported
3743     ShouldNotReachHere();
3744 }
3745 
3746 void MacroAssembler::testptr(Register dst, Register src) {
3747   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
3748 }
3749 
3750 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3751 void MacroAssembler::tlab_allocate(Register thread, Register obj,
3752                                    Register var_size_in_bytes,
3753                                    int con_size_in_bytes,
3754                                    Register t1,
3755                                    Register t2,
3756                                    Label& slow_case) {
3757   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3758   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3759 }
3760 
3761 // Defines obj, preserves var_size_in_bytes
3762 void MacroAssembler::eden_allocate(Register thread, Register obj,
3763                                    Register var_size_in_bytes,
3764                                    int con_size_in_bytes,
3765                                    Register t1,
3766                                    Label& slow_case) {
3767   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3768   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
3769 }
3770 
3771 // Preserves the contents of address, destroys the contents of length_in_bytes and temp.
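// Zeroes length_in_bytes bytes starting at address + offset_in_bytes, working
// from the top of the range downwards eight bytes per iteration (on 32-bit an
// odd trailing 4-byte word is cleared up front).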
3772 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
3773   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
3774   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
3775   Label done;
3776 
3777   testptr(length_in_bytes, length_in_bytes);
3778   jcc(Assembler::zero, done);
3779 
3780   // initialize topmost word, divide index by 2, check if odd and test if zero
3781   // note: for the remaining code to work, index must be a multiple of BytesPerWord
3782 #ifdef ASSERT
3783   {
3784     Label L;
3785     testptr(length_in_bytes, BytesPerWord - 1);
3786     jcc(Assembler::zero, L);
3787     stop("length must be a multiple of BytesPerWord");
3788     bind(L);
3789   }
3790 #endif
3791   Register index = length_in_bytes;
3792   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
3793   if (UseIncDec) {
3794     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
3795   } else {
3796     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
3797     shrptr(index, 1);
3798   }
3799 #ifndef _LP64
3800   // index might not have been a multiple of 8 (i.e., bit 2 was set)
3801   {
3802     Label even;
3803     // note: if index was a multiple of 8, then it cannot
3804     //       be 0 now otherwise it must have been 0 before
3805     //       => if it is even, we don't need to check for 0 again
3806     jcc(Assembler::carryClear, even);
3807     // clear topmost word (no jump would be needed if conditional assignment worked here)
3808     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
3809     // index could be 0 now, must check again
3810     jcc(Assembler::zero, done);
3811     bind(even);
3812   }
3813 #endif // !_LP64
3814   // initialize remaining object fields: index is a multiple of 2 now
3815   {
3816     Label loop;
3817     bind(loop);
3818     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
3819     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
3820     decrement(index);
3821     jcc(Assembler::notZero, loop);
3822   }
3823 
3824   bind(done);
3825 }
3826 
3827 // Look up the method for a megamorphic invokeinterface call.
3828 // The target method is determined by <intf_klass, itable_index>.
3829 // The receiver klass is in recv_klass.
3830 // On success, the result will be in method_result, and execution falls through.
3831 // On failure, execution transfers to the given label.
3832 void MacroAssembler::lookup_interface_method(Register recv_klass,
3833                                              Register intf_klass,
3834                                              RegisterOrConstant itable_index,
3835                                              Register method_result,
3836                                              Register scan_temp,
3837                                              Label& L_no_such_interface,
3838                                              bool return_method) {
3839   assert_different_registers(recv_klass, intf_klass, scan_temp);
3840   assert_different_registers(method_result, intf_klass, scan_temp);
3841   assert(recv_klass != method_result || !return_method,
3842          "recv_klass can be destroyed when method isn't needed");
3843 
3844   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3845          "caller must use same register for non-constant itable index as for method");
3846 
3847   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
3848   int vtable_base = in_bytes(Klass::vtable_start_offset());
3849   int itentry_off = itableMethodEntry::method_offset_in_bytes();
3850   int scan_step   = itableOffsetEntry::size() * wordSize;
3851   int vte_size    = vtableEntry::size_in_bytes();
3852   Address::ScaleFactor times_vte_scale = Address::times_ptr;
3853   assert(vte_size == wordSize, "else adjust times_vte_scale");
3854 
3855   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3856 
3857   // %%% Could store the aligned, prescaled offset in the klassoop.
3858   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
3859 
3860   if (return_method) {
3861     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3862     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3863     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
3864   }
3865 
3866   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
3867   //   if (scan->interface() == intf) {
3868   //     result = (klass + scan->offset() + itable_index);
3869   //   }
3870   // }
3871   Label search, found_method;
3872 
3873   for (int peel = 1; peel >= 0; peel--) {
3874     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
3875     cmpptr(intf_klass, method_result);
3876 
3877     if (peel) {
3878       jccb(Assembler::equal, found_method);
3879     } else {
3880       jccb(Assembler::notEqual, search);
3881       // (invert the test to fall through to found_method...)
3882     }
3883 
3884     if (!peel)  break;
3885 
3886     bind(search);
3887 
3888     // Check that the previous entry is non-null.  A null entry means that
3889     // the receiver class doesn't implement the interface, and wasn't the
3890     // same as when the caller was compiled.
3891     testptr(method_result, method_result);
3892     jcc(Assembler::zero, L_no_such_interface);
3893     addptr(scan_temp, scan_step);
3894   }
3895 
3896   bind(found_method);
3897 
3898   if (return_method) {
3899     // Got a hit.
3900     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
3901     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
3902   }
3903 }
3904 
3905 
3906 // virtual method calling
3907 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3908                                            RegisterOrConstant vtable_index,
3909                                            Register method_result) {
3910   const int base = in_bytes(Klass::vtable_start_offset());
3911   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
3912   Address vtable_entry_addr(recv_klass,
3913                             vtable_index, Address::times_ptr,
3914                             base + vtableEntry::method_offset_in_bytes());
3915   movptr(method_result, vtable_entry_addr);
3916 }
3917 
3918 
3919 void MacroAssembler::check_klass_subtype(Register sub_klass,
3920                            Register super_klass,
3921                            Register temp_reg,
3922                            Label& L_success) {
3923   Label L_failure;
3924   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
3925   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
3926   bind(L_failure);
3927 }
3928 
3929 
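// Fast-path subtype check. Performs only the two cheap tests:
//   1. sub_klass == super_klass (trivial exact match), and
//   2. the word at sub_klass + super_check_offset equals super_klass (this
//      covers all primary supers and cache hits for secondary supers).
// Depending on which labels were supplied, control branches or falls through
// to L_success, L_failure, or L_slow_path; the slow path below performs the
// full secondary-supers scan.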
3930 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3931                                                    Register super_klass,
3932                                                    Register temp_reg,
3933                                                    Label* L_success,
3934                                                    Label* L_failure,
3935                                                    Label* L_slow_path,
3936                                         RegisterOrConstant super_check_offset) {
3937   assert_different_registers(sub_klass, super_klass, temp_reg);
3938   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
3939   if (super_check_offset.is_register()) {
3940     assert_different_registers(sub_klass, super_klass,
3941                                super_check_offset.as_register());
3942   } else if (must_load_sco) {
3943     assert(temp_reg != noreg, "supply either a temp or a register offset");
3944   }
3945 
3946   Label L_fallthrough;
3947   int label_nulls = 0;
3948   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
3949   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
3950   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
3951   assert(label_nulls <= 1, "at most one NULL in the batch");
3952 
3953   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3954   int sco_offset = in_bytes(Klass::super_check_offset_offset());
3955   Address super_check_offset_addr(super_klass, sco_offset);
3956 
3957   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
3958   // range of a jccb.  If this routine grows larger, reconsider at
3959   // least some of these.
3960 #define local_jcc(assembler_cond, label)                                \
3961   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
3962   else                             jcc( assembler_cond, label) /*omit semi*/
3963 
3964   // Hacked jmp, which may only be used just before L_fallthrough.
3965 #define final_jmp(label)                                                \
3966   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
3967   else                            jmp(label)                /*omit semi*/
3968 
3969   // If the pointers are equal, we are done (e.g., String[] elements).
3970   // This self-check enables sharing of secondary supertype arrays among
3971   // non-primary types such as array-of-interface.  Otherwise, each such
3972   // type would need its own customized SSA.
3973   // We move this check to the front of the fast path because many
3974   // type checks are in fact trivially successful in this manner,
3975   // so we get a nicely predicted branch right at the start of the check.
3976   cmpptr(sub_klass, super_klass);
3977   local_jcc(Assembler::equal, *L_success);
3978 
3979   // Check the supertype display:
3980   if (must_load_sco) {
3981     // Positive movl does the right thing on LP64.
3982     movl(temp_reg, super_check_offset_addr);
3983     super_check_offset = RegisterOrConstant(temp_reg);
3984   }
3985   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
3986   cmpptr(super_klass, super_check_addr); // load displayed supertype
3987 
3988   // This check has worked decisively for primary supers.
3989   // Secondary supers are sought in the super_cache ('super_cache_addr').
3990   // (Secondary supers are interfaces and very deeply nested subtypes.)
3991   // This works in the same check above because of a tricky aliasing
3992   // between the super_cache and the primary super display elements.
3993   // (The 'super_check_addr' can address either, as the case requires.)
3994   // Note that the cache is updated below if it does not help us find
3995   // what we need immediately.
3996   // So if it was a primary super, we can just fail immediately.
3997   // Otherwise, it's the slow path for us (no success at this point).
3998 
3999   if (super_check_offset.is_register()) {
4000     local_jcc(Assembler::equal, *L_success);
4001     cmpl(super_check_offset.as_register(), sc_offset);
4002     if (L_failure == &L_fallthrough) {
4003       local_jcc(Assembler::equal, *L_slow_path);
4004     } else {
4005       local_jcc(Assembler::notEqual, *L_failure);
4006       final_jmp(*L_slow_path);
4007     }
4008   } else if (super_check_offset.as_constant() == sc_offset) {
4009     // Need a slow path; fast failure is impossible.
4010     if (L_slow_path == &L_fallthrough) {
4011       local_jcc(Assembler::equal, *L_success);
4012     } else {
4013       local_jcc(Assembler::notEqual, *L_slow_path);
4014       final_jmp(*L_success);
4015     }
4016   } else {
4017     // No slow path; it's a fast decision.
4018     if (L_failure == &L_fallthrough) {
4019       local_jcc(Assembler::equal, *L_success);
4020     } else {
4021       local_jcc(Assembler::notEqual, *L_failure);
4022       final_jmp(*L_success);
4023     }
4024   }
4025 
4026   bind(L_fallthrough);
4027 
4028 #undef local_jcc
4029 #undef final_jmp
4030 }
4031 
4032 
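// Slow-path subtype check: a linear repne/scas scan of sub_klass's
// secondary-supers array, caching a hit in the secondary_super_cache so the
// fast path above succeeds the next time.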
4033 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4034                                                    Register super_klass,
4035                                                    Register temp_reg,
4036                                                    Register temp2_reg,
4037                                                    Label* L_success,
4038                                                    Label* L_failure,
4039                                                    bool set_cond_codes) {
4040   assert_different_registers(sub_klass, super_klass, temp_reg);
4041   if (temp2_reg != noreg)
4042     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4043 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4044 
4045   Label L_fallthrough;
4046   int label_nulls = 0;
4047   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4048   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4049   assert(label_nulls <= 1, "at most one NULL in the batch");
4050 
4051   // a couple of useful fields in sub_klass:
4052   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4053   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4054   Address secondary_supers_addr(sub_klass, ss_offset);
4055   Address super_cache_addr(     sub_klass, sc_offset);
4056 
4057   // Do a linear scan of the secondary super-klass chain.
4058   // This code is rarely used, so simplicity is a virtue here.
4059   // The repne_scan instruction uses fixed registers, which we must spill.
4060   // Don't worry too much about pre-existing connections with the input regs.
4061 
4062   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4063   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4064 
4065   // Get super_klass value into rax (even if it was in rdi or rcx).
4066   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4067   if (super_klass != rax) {
4068     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4069     mov(rax, super_klass);
4070   }
4071   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4072   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4073 
4074 #ifndef PRODUCT
4075   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4076   ExternalAddress pst_counter_addr((address) pst_counter);
4077   NOT_LP64(  incrementl(pst_counter_addr) );
4078   LP64_ONLY( lea(rcx, pst_counter_addr) );
4079   LP64_ONLY( incrementl(Address(rcx, 0)) );
4080 #endif //PRODUCT
4081 
4082   // We will consult the secondary-super array.
4083   movptr(rdi, secondary_supers_addr);
4084   // Load the array length.  (Positive movl does the right thing on LP64.)
4085   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4086   // Skip to start of data.
4087   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4088 
4089   // Scan RCX words at [RDI] for an occurrence of RAX.
4090   // Set NZ/Z based on last compare.
4091   // The Z flag will not be set by 'repne' itself if RCX == 0, since 'repne'
4092   // does not change flags (only the repeated scas instruction sets them).
4093   // Set Z = 0 (not equal) before 'repne' to indicate that the class was not found.
4094 
4095   testptr(rax, rax); // Set Z = 0
4096   repne_scan();
4097 
4098   // Unspill the temp. registers:
4099   if (pushed_rdi)  pop(rdi);
4100   if (pushed_rcx)  pop(rcx);
4101   if (pushed_rax)  pop(rax);
4102 
4103   if (set_cond_codes) {
4104     // Special hack for the AD files:  rdi is guaranteed non-zero.
4105     assert(!pushed_rdi, "rdi must be left non-NULL");
4106     // Also, the condition codes are properly set Z/NZ on success/failure.
4107   }
4108 
4109   if (L_failure == &L_fallthrough)
4110         jccb(Assembler::notEqual, *L_failure);
4111   else  jcc(Assembler::notEqual, *L_failure);
4112 
4113   // Success.  Cache the super we found and proceed in triumph.
4114   movptr(super_cache_addr, super_klass);
4115 
4116   if (L_success != &L_fallthrough) {
4117     jmp(*L_success);
4118   }
4119 
4120 #undef IS_A_TEMP
4121 
4122   bind(L_fallthrough);
4123 }
4124 
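// Class-initialization barrier: the fast path is taken when the klass is
// already fully initialized, or when the current thread is the one running
// its <clinit> (the initializer thread); everything else goes to the slow path.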
4125 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4126   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
4127 
4128   Label L_fallthrough;
4129   if (L_fast_path == NULL) {
4130     L_fast_path = &L_fallthrough;
4131   } else if (L_slow_path == NULL) {
4132     L_slow_path = &L_fallthrough;
4133   }
4134 
4135   // Fast path check: class is fully initialized
4136   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4137   jcc(Assembler::equal, *L_fast_path);
4138 
4139   // Fast path check: current thread is initializer thread
4140   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4141   if (L_slow_path == &L_fallthrough) {
4142     jcc(Assembler::equal, *L_fast_path);
4143     bind(*L_slow_path);
4144   } else if (L_fast_path == &L_fallthrough) {
4145     jcc(Assembler::notEqual, *L_slow_path);
4146     bind(*L_fast_path);
4147   } else {
4148     Unimplemented();
4149   }
4150 }
4151 
4152 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4153   if (VM_Version::supports_cmov()) {
4154     cmovl(cc, dst, src);
4155   } else {
4156     Label L;
4157     jccb(negate_condition(cc), L);
4158     movl(dst, src);
4159     bind(L);
4160   }
4161 }
4162 
4163 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4164   if (VM_Version::supports_cmov()) {
4165     cmovl(cc, dst, src);
4166   } else {
4167     Label L;
4168     jccb(negate_condition(cc), L);
4169     movl(dst, src);
4170     bind(L);
4171   }
4172 }
4173 
4174 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
4175   if (!VerifyOops) return;
4176 
4177   // Pass register number to verify_oop_subroutine
4178   const char* b = NULL;
4179   {
4180     ResourceMark rm;
4181     stringStream ss;
4182     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
4183     b = code_string(ss.as_string());
4184   }
4185   BLOCK_COMMENT("verify_oop {");
4186 #ifdef _LP64
4187   push(rscratch1);                    // save r10, trashed by movptr()
4188 #endif
4189   push(rax);                          // save rax,
4190   push(reg);                          // pass register argument
4191   ExternalAddress buffer((address) b);
4192   // avoid using pushptr, as it modifies scratch registers
4193   // and our contract is not to modify anything
4194   movptr(rax, buffer.addr());
4195   push(rax);
4196   // call indirectly to solve generation ordering problem
4197   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4198   call(rax);
4199   // Caller pops the arguments (oop, message) and restores rax, r10
4200   BLOCK_COMMENT("} verify_oop");
4201 }
4202 
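// Fill dst with all ones. With AVX-512 (512-bit length, or AVX512VL for
// shorter vectors) vpternlogd with the constant truth table 0xFF produces the
// result directly; otherwise the classic vpcmpeqb dst,dst,dst idiom is used.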
4203 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
4204   if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
4205     vpternlogd(dst, 0xFF, dst, dst, vector_len);
4206   } else {
4207     assert(UseAVX > 0, "");
4208     vpcmpeqb(dst, dst, dst, vector_len);
4209   }
4210 }
4211 
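// Compute the address of an interpreter expression-stack argument slot.
// arg_slot may be a compile-time constant or a register holding the slot
// index; the extra wordSize accounts for the return PC already on the stack.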
4212 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4213                                          int extra_slot_offset) {
4214   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4215   int stackElementSize = Interpreter::stackElementSize;
4216   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4217 #ifdef ASSERT
4218   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4219   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4220 #endif
4221   Register             scale_reg    = noreg;
4222   Address::ScaleFactor scale_factor = Address::no_scale;
4223   if (arg_slot.is_constant()) {
4224     offset += arg_slot.as_constant() * stackElementSize;
4225   } else {
4226     scale_reg    = arg_slot.as_register();
4227     scale_factor = Address::times(stackElementSize);
4228   }
4229   offset += wordSize;           // return PC is on stack
4230   return Address(rsp, scale_reg, scale_factor, offset);
4231 }
4232 
4233 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
4234   if (!VerifyOops) return;
4235 
4236   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4237   // Pass register number to verify_oop_subroutine
4238   const char* b = NULL;
4239   {
4240     ResourceMark rm;
4241     stringStream ss;
4242     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
4243     b = code_string(ss.as_string());
4244   }
4245 #ifdef _LP64
4246   push(rscratch1);                    // save r10, trashed by movptr()
4247 #endif
4248   push(rax);                          // save rax,
4249   // addr may contain rsp, so we have to adjust it based on the push
4250   // we just did (and on 64-bit we do two pushes).
4251   // NOTE: the 64-bit code seemed to have had a bug in that it did movq(addr, rax),
4252   // which stores rax into addr, the reverse of what was intended.
4253   if (addr.uses(rsp)) {
4254     lea(rax, addr);
4255     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4256   } else {
4257     pushptr(addr);
4258   }
4259 
4260   ExternalAddress buffer((address) b);
4261   // pass msg argument
4262   // avoid using pushptr, as it modifies scratch registers
4263   // and our contract is not to modify anything
4264   movptr(rax, buffer.addr());
4265   push(rax);
4266 
4267   // call indirectly to solve generation ordering problem
4268   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4269   call(rax);
4270   // Caller pops the arguments (addr, message) and restores rax, r10.
4271 }
4272 
4273 void MacroAssembler::verify_tlab() {
4274 #ifdef ASSERT
4275   if (UseTLAB && VerifyOops) {
4276     Label next, ok;
4277     Register t1 = rsi;
4278     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4279 
4280     push(t1);
4281     NOT_LP64(push(thread_reg));
4282     NOT_LP64(get_thread(thread_reg));
4283 
4284     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4285     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4286     jcc(Assembler::aboveEqual, next);
4287     STOP("assert(top >= start)");
4288     should_not_reach_here();
4289 
4290     bind(next);
4291     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4292     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4293     jcc(Assembler::aboveEqual, ok);
4294     STOP("assert(top <= end)");
4295     should_not_reach_here();
4296 
4297     bind(ok);
4298     NOT_LP64(pop(thread_reg));
4299     pop(t1);
4300   }
4301 #endif
4302 }
4303 
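// The classes below (ControlWord through IU_State) mirror the in-memory layout
// of saved x87 FPU and integer register state and are used only to decode and
// pretty-print that state for debugging/verification; they do not affect code
// generation. (Presumed purpose, inferred from their print() methods.)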
4304 class ControlWord {
4305  public:
4306   int32_t _value;
4307 
4308   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4309   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4310   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4311   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4312   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4313   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4314   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4315   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4316 
4317   void print() const {
4318     // rounding control
4319     const char* rc;
4320     switch (rounding_control()) {
4321       case 0: rc = "round near"; break;
4322       case 1: rc = "round down"; break;
4323       case 2: rc = "round up  "; break;
4324       case 3: rc = "chop      "; break;
4325       default:
4326         rc = NULL; // silence compiler warnings
4327         fatal("Unknown rounding control: %d", rounding_control());
4328     };
4329     // precision control
4330     const char* pc;
4331     switch (precision_control()) {
4332       case 0: pc = "24 bits "; break;
4333       case 1: pc = "reserved"; break;
4334       case 2: pc = "53 bits "; break;
4335       case 3: pc = "64 bits "; break;
4336       default:
4337         pc = NULL; // silence compiler warnings
4338         fatal("Unknown precision control: %d", precision_control());
4339     };
4340     // flags
4341     char f[9];
4342     f[0] = ' ';
4343     f[1] = ' ';
4344     f[2] = (precision   ()) ? 'P' : 'p';
4345     f[3] = (underflow   ()) ? 'U' : 'u';
4346     f[4] = (overflow    ()) ? 'O' : 'o';
4347     f[5] = (zero_divide ()) ? 'Z' : 'z';
4348     f[6] = (denormalized()) ? 'D' : 'd';
4349     f[7] = (invalid     ()) ? 'I' : 'i';
4350     f[8] = '\x0';
4351     // output
4352     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4353   }
4354 
4355 };
4356 
4357 class StatusWord {
4358  public:
4359   int32_t _value;
4360 
4361   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4362   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4363   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4364   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4365   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4366   int  top() const                     { return  (_value >> 11) & 7      ; }
4367   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4368   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4369   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4370   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4371   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4372   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4373   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4374   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4375 
4376   void print() const {
4377     // condition codes
4378     char c[5];
4379     c[0] = (C3()) ? '3' : '-';
4380     c[1] = (C2()) ? '2' : '-';
4381     c[2] = (C1()) ? '1' : '-';
4382     c[3] = (C0()) ? '0' : '-';
4383     c[4] = '\x0';
4384     // flags
4385     char f[9];
4386     f[0] = (error_status()) ? 'E' : '-';
4387     f[1] = (stack_fault ()) ? 'S' : '-';
4388     f[2] = (precision   ()) ? 'P' : '-';
4389     f[3] = (underflow   ()) ? 'U' : '-';
4390     f[4] = (overflow    ()) ? 'O' : '-';
4391     f[5] = (zero_divide ()) ? 'Z' : '-';
4392     f[6] = (denormalized()) ? 'D' : '-';
4393     f[7] = (invalid     ()) ? 'I' : '-';
4394     f[8] = '\x0';
4395     // output
4396     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4397   }
4398 
4399 };
4400 
4401 class TagWord {
4402  public:
4403   int32_t _value;
4404 
4405   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4406 
4407   void print() const {
4408     printf("%04x", _value & 0xFFFF);
4409   }
4410 
4411 };
4412 
4413 class FPU_Register {
4414  public:
4415   int32_t _m0;
4416   int32_t _m1;
4417   int16_t _ex;
4418 
4419   bool is_indefinite() const           {
4420     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4421   }
4422 
4423   void print() const {
4424     char  sign = (_ex < 0) ? '-' : '+';
4425     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4426     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
4427   };
4428 
4429 };
4430 
4431 class FPU_State {
4432  public:
4433   enum {
4434     register_size       = 10,
4435     number_of_registers =  8,
4436     register_mask       =  7
4437   };
4438 
4439   ControlWord  _control_word;
4440   StatusWord   _status_word;
4441   TagWord      _tag_word;
4442   int32_t      _error_offset;
4443   int32_t      _error_selector;
4444   int32_t      _data_offset;
4445   int32_t      _data_selector;
4446   int8_t       _register[register_size * number_of_registers];
4447 
4448   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4449   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
4450 
4451   const char* tag_as_string(int tag) const {
4452     switch (tag) {
4453       case 0: return "valid";
4454       case 1: return "zero";
4455       case 2: return "special";
4456       case 3: return "empty";
4457     }
4458     ShouldNotReachHere();
4459     return NULL;
4460   }
4461 
4462   void print() const {
4463     // print computation registers
4464     { int t = _status_word.top();
4465       for (int i = 0; i < number_of_registers; i++) {
4466         int j = (i - t) & register_mask;
4467         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4468         st(j)->print();
4469         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4470       }
4471     }
4472     printf("\n");
4473     // print control registers
4474     printf("ctrl = "); _control_word.print(); printf("\n");
4475     printf("stat = "); _status_word .print(); printf("\n");
4476     printf("tags = "); _tag_word    .print(); printf("\n");
4477   }
4478 
4479 };
4480 
4481 class Flag_Register {
4482  public:
4483   int32_t _value;
4484 
4485   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
4486   bool direction() const               { return ((_value >> 10) & 1) != 0; }
4487   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
4488   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
4489   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
4490   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
4491   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
4492 
4493   void print() const {
4494     // flags
4495     char f[8];
4496     f[0] = (overflow       ()) ? 'O' : '-';
4497     f[1] = (direction      ()) ? 'D' : '-';
4498     f[2] = (sign           ()) ? 'S' : '-';
4499     f[3] = (zero           ()) ? 'Z' : '-';
4500     f[4] = (auxiliary_carry()) ? 'A' : '-';
4501     f[5] = (parity         ()) ? 'P' : '-';
4502     f[6] = (carry          ()) ? 'C' : '-';
4503     f[7] = '\x0';
4504     // output
4505     printf("%08x  flags = %s", _value, f);
4506   }
4507 
4508 };
4509 
4510 class IU_Register {
4511  public:
4512   int32_t _value;
4513 
4514   void print() const {
4515     printf("%08x  %11d", _value, _value);
4516   }
4517 
4518 };
4519 
4520 class IU_State {
4521  public:
4522   Flag_Register _eflags;
4523   IU_Register   _rdi;
4524   IU_Register   _rsi;
4525   IU_Register   _rbp;
4526   IU_Register   _rsp;
4527   IU_Register   _rbx;
4528   IU_Register   _rdx;
4529   IU_Register   _rcx;
4530   IU_Register   _rax;
4531 
4532   void print() const {
4533     // computation registers
4534     printf("rax,  = "); _rax.print(); printf("\n");
4535     printf("rbx,  = "); _rbx.print(); printf("\n");
4536     printf("rcx  = "); _rcx.print(); printf("\n");
4537     printf("rdx  = "); _rdx.print(); printf("\n");
4538     printf("rdi  = "); _rdi.print(); printf("\n");
4539     printf("rsi  = "); _rsi.print(); printf("\n");
4540     printf("rbp,  = "); _rbp.print(); printf("\n");
4541     printf("rsp  = "); _rsp.print(); printf("\n");
4542     printf("\n");
4543     // control registers
4544     printf("flgs = "); _eflags.print(); printf("\n");
4545   }
4546 };
4547 
4548 
4549 class CPU_State {
4550  public:
4551   FPU_State _fpu_state;
4552   IU_State  _iu_state;
4553 
4554   void print() const {
4555     printf("--------------------------------------------------\n");
4556     _iu_state .print();
4557     printf("\n");
4558     _fpu_state.print();
4559     printf("--------------------------------------------------\n");
4560   }
4561 
4562 };
4563 
4564 
4565 static void _print_CPU_state(CPU_State* state) {
4566   state->print();
4567 };
4568 
4569 
4570 void MacroAssembler::print_CPU_state() {
4571   push_CPU_state();
4572   push(rsp);                // pass CPU state
4573   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
4574   addptr(rsp, wordSize);       // discard argument
4575   pop_CPU_state();
4576 }
4577 
4578 
4579 #ifndef _LP64
4580 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
4581   static int counter = 0;
4582   FPU_State* fs = &state->_fpu_state;
4583   counter++;
4584   // For leaf calls, only verify that the top few elements remain empty.
4585   // We only need 1 empty at the top for C2 code.
4586   if( stack_depth < 0 ) {
4587     if( fs->tag_for_st(7) != 3 ) {
4588       printf("FPR7 not empty\n");
4589       state->print();
4590       assert(false, "error");
4591       return false;
4592     }
4593     return true;                // All other stack states do not matter
4594   }
4595 
4596   assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(),
4597          "bad FPU control word");
4598 
4599   // compute stack depth
4600   int i = 0;
4601   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
4602   int d = i;
4603   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
4604   // verify findings
4605   if (i != FPU_State::number_of_registers) {
4606     // stack not contiguous
4607     printf("%s: stack not contiguous at ST%d\n", s, i);
4608     state->print();
4609     assert(false, "error");
4610     return false;
4611   }
4612   // check if computed stack depth corresponds to expected stack depth
4613   if (stack_depth < 0) {
4614     // expected stack depth is -stack_depth or less
4615     if (d > -stack_depth) {
4616       // too many elements on the stack
4617       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
4618       state->print();
4619       assert(false, "error");
4620       return false;
4621     }
4622   } else {
4623     // expected stack depth is stack_depth
4624     if (d != stack_depth) {
4625       // wrong stack depth
4626       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
4627       state->print();
4628       assert(false, "error");
4629       return false;
4630     }
4631   }
4632   // everything is cool
4633   return true;
4634 }
4635 
4636 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
4637   if (!VerifyFPU) return;
4638   push_CPU_state();
4639   push(rsp);                // pass CPU state
4640   ExternalAddress msg((address) s);
4641   // pass message string s
4642   pushptr(msg.addr());
4643   push(stack_depth);        // pass stack depth
4644   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
4645   addptr(rsp, 3 * wordSize);   // discard arguments
4646   // check for error
4647   { Label L;
4648     testl(rax, rax);
4649     jcc(Assembler::notZero, L);
4650     int3();                  // break if error condition
4651     bind(L);
4652   }
4653   pop_CPU_state();
4654 }
4655 #endif // !_LP64
4656 
4657 void MacroAssembler::restore_cpu_control_state_after_jni() {
4658   // Either restore the MXCSR register after returning from the JNI Call
4659   // or verify that it wasn't changed (with -Xcheck:jni flag).
4660   if (VM_Version::supports_sse()) {
4661     if (RestoreMXCSROnJNICalls) {
4662       ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()));
4663     } else if (CheckJNICalls) {
4664       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
4665     }
4666   }
4667   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
4668   vzeroupper();
4669   // Reset k1 to 0xffff.
4670 
4671 #ifdef COMPILER2
4672   if (PostLoopMultiversioning && VM_Version::supports_evex()) {
4673     push(rcx);
4674     movl(rcx, 0xffff);
4675     kmovwl(k1, rcx);
4676     pop(rcx);
4677   }
4678 #endif // COMPILER2
4679 
4680 #ifndef _LP64
4681   // Either restore the x87 floating-point control word after returning
4682   // from the JNI call or verify that it wasn't changed.
4683   if (CheckJNICalls) {
4684     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
4685   }
4686 #endif // !_LP64
4687 }
4688 
4689 // ((OopHandle)result).resolve();
4690 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
4691   assert_different_registers(result, tmp);
4692 
4693   // Only 64 bit platforms support GCs that require a tmp register
4694   // Only IN_HEAP loads require a thread_tmp register
4695   // OopHandle::resolve is an indirection like jobject.
4696   access_load_at(T_OBJECT, IN_NATIVE,
4697                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
4698 }
4699 
4700 // ((WeakHandle)result).resolve();
4701 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
4702   assert_different_registers(rresult, rtmp);
4703   Label resolved;
4704 
4705   // A null weak handle resolves to null.
4706   cmpptr(rresult, 0);
4707   jcc(Assembler::equal, resolved);
4708 
4709   // Only 64 bit platforms support GCs that require a tmp register
4710   // Only IN_HEAP loads require a thread_tmp register
4711   // WeakHandle::resolve is an indirection like jweak.
4712   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4713                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
4714   bind(resolved);
4715 }
4716 
4717 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
4718   // get mirror
4719   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
4720   load_method_holder(mirror, method);
4721   movptr(mirror, Address(mirror, mirror_offset));
4722   resolve_oop_handle(mirror, tmp);
4723 }
4724 
4725 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
4726   load_method_holder(rresult, rmethod);
4727   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
4728 }
4729 
4730 void MacroAssembler::load_method_holder(Register holder, Register method) {
4731   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4732   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4733   movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
4734 }
4735 
4736 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
4737   assert_different_registers(src, tmp);
4738   assert_different_registers(dst, tmp);
4739 #ifdef _LP64
4740   if (UseCompressedClassPointers) {
4741     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4742     decode_klass_not_null(dst, tmp);
4743   } else
4744 #endif
4745     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4746 }
4747 
4748 void MacroAssembler::load_prototype_header(Register dst, Register src, Register tmp) {
4749   load_klass(dst, src, tmp);
4750   movptr(dst, Address(dst, Klass::prototype_header_offset()));
4751 }
4752 
4753 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
4754   assert_different_registers(src, tmp);
4755   assert_different_registers(dst, tmp);
4756 #ifdef _LP64
4757   if (UseCompressedClassPointers) {
4758     encode_klass_not_null(src, tmp);
4759     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4760   } else
4761 #endif
4762     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4763 }
4764 
4765 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
4766                                     Register tmp1, Register thread_tmp) {
4767   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4768   decorators = AccessInternal::decorator_fixup(decorators);
4769   bool as_raw = (decorators & AS_RAW) != 0;
4770   if (as_raw) {
4771     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4772   } else {
4773     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4774   }
4775 }
4776 
4777 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
4778                                      Register tmp1, Register tmp2) {
4779   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4780   decorators = AccessInternal::decorator_fixup(decorators);
4781   bool as_raw = (decorators & AS_RAW) != 0;
4782   if (as_raw) {
4783     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
4784   } else {
4785     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
4786   }
4787 }
4788 
4789 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4790                                    Register thread_tmp, DecoratorSet decorators) {
4791   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4792 }
4793 
4794 // Doesn't do verification, generates fixed size code
4795 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4796                                             Register thread_tmp, DecoratorSet decorators) {
4797   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4798 }
4799 
4800 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4801                                     Register tmp2, DecoratorSet decorators) {
4802   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
4803 }
4804 
4805 // Used for storing NULLs.
4806 void MacroAssembler::store_heap_oop_null(Address dst) {
4807   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4808 }
4809 
4810 #ifdef _LP64
4811 void MacroAssembler::store_klass_gap(Register dst, Register src) {
4812   if (UseCompressedClassPointers) {
4813     // Store to klass gap in destination
4814     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
4815   }
4816 }
4817 
4818 #ifdef ASSERT
4819 void MacroAssembler::verify_heapbase(const char* msg) {
4820   assert (UseCompressedOops, "should be compressed");
4821   assert (Universe::heap() != NULL, "java heap should be initialized");
4822   if (CheckCompressedOops) {
4823     Label ok;
4824     push(rscratch1); // cmpptr trashes rscratch1
4825     cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
4826     jcc(Assembler::equal, ok);
4827     STOP(msg);
4828     bind(ok);
4829     pop(rscratch1);
4830   }
4831 }
4832 #endif
4833 
4834 // Algorithm must match oop.inline.hpp encode_heap_oop.
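     // In effect (a sketch of what the code below emits):
     //   narrow = (oop == NULL) ? 0 : (oop - CompressedOops::base()) >> CompressedOops::shift()
     // When a heap base is in use, the testq/cmovq pair substitutes the heap
     // base for NULL so the subtraction yields 0 without a branch.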
4835 void MacroAssembler::encode_heap_oop(Register r) {
4836 #ifdef ASSERT
4837   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
4838 #endif
4839   verify_oop_msg(r, "broken oop in encode_heap_oop");
4840   if (CompressedOops::base() == NULL) {
4841     if (CompressedOops::shift() != 0) {
4842       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4843       shrq(r, LogMinObjAlignmentInBytes);
4844     }
4845     return;
4846   }
4847   testq(r, r);
4848   cmovq(Assembler::equal, r, r12_heapbase);
4849   subq(r, r12_heapbase);
4850   shrq(r, LogMinObjAlignmentInBytes);
4851 }
4852 
4853 void MacroAssembler::encode_heap_oop_not_null(Register r) {
4854 #ifdef ASSERT
4855   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
4856   if (CheckCompressedOops) {
4857     Label ok;
4858     testq(r, r);
4859     jcc(Assembler::notEqual, ok);
4860     STOP("null oop passed to encode_heap_oop_not_null");
4861     bind(ok);
4862   }
4863 #endif
4864   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
4865   if (CompressedOops::base() != NULL) {
4866     subq(r, r12_heapbase);
4867   }
4868   if (CompressedOops::shift() != 0) {
4869     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4870     shrq(r, LogMinObjAlignmentInBytes);
4871   }
4872 }
4873 
4874 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
4875 #ifdef ASSERT
4876   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
4877   if (CheckCompressedOops) {
4878     Label ok;
4879     testq(src, src);
4880     jcc(Assembler::notEqual, ok);
4881     STOP("null oop passed to encode_heap_oop_not_null2");
4882     bind(ok);
4883   }
4884 #endif
4885   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
4886   if (dst != src) {
4887     movq(dst, src);
4888   }
4889   if (CompressedOops::base() != NULL) {
4890     subq(dst, r12_heapbase);
4891   }
4892   if (CompressedOops::shift() != 0) {
4893     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4894     shrq(dst, LogMinObjAlignmentInBytes);
4895   }
4896 }
4897 
4898 void  MacroAssembler::decode_heap_oop(Register r) {
4899 #ifdef ASSERT
4900   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
4901 #endif
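       // In effect (a sketch of what the code below emits):
       //   oop = (narrow == 0) ? NULL : CompressedOops::base() + (narrow << CompressedOops::shift())
       // When a heap base is in use, the shlq leaves ZF set for a zero input,
       // so the jccb skips the base addition and NULL stays NULL.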
4902   if (CompressedOops::base() == NULL) {
4903     if (CompressedOops::shift() != 0) {
4904       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4905       shlq(r, LogMinObjAlignmentInBytes);
4906     }
4907   } else {
4908     Label done;
4909     shlq(r, LogMinObjAlignmentInBytes);
4910     jccb(Assembler::equal, done);
4911     addq(r, r12_heapbase);
4912     bind(done);
4913   }
4914   verify_oop_msg(r, "broken oop in decode_heap_oop");
4915 }
4916 
4917 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
4918   // Note: it will change flags
4919   assert (UseCompressedOops, "should only be used for compressed headers");
4920   assert (Universe::heap() != NULL, "java heap should be initialized");
4921   // Cannot assert, unverified entry point counts instructions (see .ad file)
4922   // vtableStubs also counts instructions in pd_code_size_limit.
4923   // Also do not verify_oop as this is called by verify_oop.
4924   if (CompressedOops::shift() != 0) {
4925     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4926     shlq(r, LogMinObjAlignmentInBytes);
4927     if (CompressedOops::base() != NULL) {
4928       addq(r, r12_heapbase);
4929     }
4930   } else {
4931     assert (CompressedOops::base() == NULL, "sanity");
4932   }
4933 }
4934 
4935 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
4936   // Note: it will change flags
4937   assert (UseCompressedOops, "should only be used for compressed headers");
4938   assert (Universe::heap() != NULL, "java heap should be initialized");
4939   // Cannot assert, unverified entry point counts instructions (see .ad file)
4940   // vtableStubs also counts instructions in pd_code_size_limit.
4941   // Also do not verify_oop as this is called by verify_oop.
4942   if (CompressedOops::shift() != 0) {
4943     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4944     if (LogMinObjAlignmentInBytes == Address::times_8) {
4945       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
4946     } else {
4947       if (dst != src) {
4948         movq(dst, src);
4949       }
4950       shlq(dst, LogMinObjAlignmentInBytes);
4951       if (CompressedOops::base() != NULL) {
4952         addq(dst, r12_heapbase);
4953       }
4954     }
4955   } else {
4956     assert (CompressedOops::base() == NULL, "sanity");
4957     if (dst != src) {
4958       movq(dst, src);
4959     }
4960   }
4961 }
4962 
4963 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
4964   assert_different_registers(r, tmp);
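       // A sketch of the encoding emitted below (decode_klass_not_null inverts it):
       //   narrow_klass = (Klass* - CompressedKlassPointers::base()) >> CompressedKlassPointers::shift()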
4965   if (CompressedKlassPointers::base() != NULL) {
4966     mov64(tmp, (int64_t)CompressedKlassPointers::base());
4967     subq(r, tmp);
4968   }
4969   if (CompressedKlassPointers::shift() != 0) {
4970     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4971     shrq(r, LogKlassAlignmentInBytes);
4972   }
4973 }
4974 
4975 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
4976   assert_different_registers(src, dst);
4977   if (CompressedKlassPointers::base() != NULL) {
4978     mov64(dst, -(int64_t)CompressedKlassPointers::base());
4979     addq(dst, src);
4980   } else {
4981     movptr(dst, src);
4982   }
4983   if (CompressedKlassPointers::shift() != 0) {
4984     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4985     shrq(dst, LogKlassAlignmentInBytes);
4986   }
4987 }
4988 
4989 // !!! If the instructions that get generated here change then function
4990 // instr_size_for_decode_klass_not_null() needs to get updated.
4991 void  MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
4992   assert_different_registers(r, tmp);
4993   // Note: it will change flags
4994   assert(UseCompressedClassPointers, "should only be used for compressed headers");
4995   // Cannot assert, unverified entry point counts instructions (see .ad file)
4996   // vtableStubs also counts instructions in pd_code_size_limit.
4997   // Also do not verify_oop as this is called by verify_oop.
4998   if (CompressedKlassPointers::shift() != 0) {
4999     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5000     shlq(r, LogKlassAlignmentInBytes);
5001   }
5002   if (CompressedKlassPointers::base() != NULL) {
5003     mov64(tmp, (int64_t)CompressedKlassPointers::base());
5004     addq(r, tmp);
5005   }
5006 }
5007 
5008 void  MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
5009   assert_different_registers(src, dst);
5010   // Note: it will change flags
5011   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5012   // Cannot assert, unverified entry point counts instructions (see .ad file)
5013   // vtableStubs also counts instructions in pd_code_size_limit.
5014   // Also do not verify_oop as this is called by verify_oop.
5015 
5016   if (CompressedKlassPointers::base() == NULL &&
5017       CompressedKlassPointers::shift() == 0) {
5018     // The best case is that there is neither a base nor a shift: the narrow
5019     // value is already the Klass* and needs nothing but a register move.
5020     movl(dst, src);
5021   } else {
5022     if (CompressedKlassPointers::base() != NULL) {
5023       mov64(dst, (int64_t)CompressedKlassPointers::base());
5024     } else {
5025       xorq(dst, dst);
5026     }
5027     if (CompressedKlassPointers::shift() != 0) {
5028       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5029       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5030       leaq(dst, Address(dst, src, Address::times_8, 0));
5031     } else {
5032       addq(dst, src);
5033     }
5034   }
5035 }
5036 
5037 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5038   assert (UseCompressedOops, "should only be used for compressed headers");
5039   assert (Universe::heap() != NULL, "java heap should be initialized");
5040   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5041   int oop_index = oop_recorder()->find_index(obj);
5042   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5043   mov_narrow_oop(dst, oop_index, rspec);
5044 }
5045 
5046 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5047   assert (UseCompressedOops, "should only be used for compressed headers");
5048   assert (Universe::heap() != NULL, "java heap should be initialized");
5049   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5050   int oop_index = oop_recorder()->find_index(obj);
5051   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5052   mov_narrow_oop(dst, oop_index, rspec);
5053 }
5054 
5055 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5056   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5057   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5058   int klass_index = oop_recorder()->find_index(k);
5059   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5060   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5061 }
5062 
5063 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5064   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5065   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5066   int klass_index = oop_recorder()->find_index(k);
5067   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5068   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5069 }
5070 
5071 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5072   assert (UseCompressedOops, "should only be used for compressed headers");
5073   assert (Universe::heap() != NULL, "java heap should be initialized");
5074   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5075   int oop_index = oop_recorder()->find_index(obj);
5076   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5077   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5078 }
5079 
5080 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5081   assert (UseCompressedOops, "should only be used for compressed headers");
5082   assert (Universe::heap() != NULL, "java heap should be initialized");
5083   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5084   int oop_index = oop_recorder()->find_index(obj);
5085   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5086   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5087 }
5088 
5089 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5090   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5091   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5092   int klass_index = oop_recorder()->find_index(k);
5093   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5094   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5095 }
5096 
5097 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5098   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5099   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5100   int klass_index = oop_recorder()->find_index(k);
5101   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5102   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5103 }
5104 
5105 void MacroAssembler::reinit_heapbase() {
5106   if (UseCompressedOops) {
5107     if (Universe::heap() != NULL) {
5108       if (CompressedOops::base() == NULL) {
5109         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5110       } else {
5111         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5112       }
5113     } else {
5114       movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5115     }
5116   }
5117 }
5118 
5119 #endif // _LP64
5120 
5121 // C2 compiled method's prolog code.
5122 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
5123 
5124   // WARNING: Initial instruction MUST be 5 bytes or longer so that
5125   // NativeJump::patch_verified_entry will be able to patch out the entry
5126   // code safely. The push to verify stack depth is ok at 5 bytes,
5127   // the frame allocation can be either 3 or 6 bytes. So if we don't do
5128   // stack bang then we must use the 6 byte frame allocation even if
5129   // we have no frame. :-(
5130   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
5131 
5132   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
5133   // Remove word for return addr
5134   framesize -= wordSize;
5135   stack_bang_size -= wordSize;
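       // Worked sketch (assuming x86_64 and a framesize of 32): dropping the
       // return-address word leaves 24 bytes; with a stack bang we push rbp and
       // subtract 16 from rsp, without one we subtract 24 up front and store rbp
       // at [rsp + 16]. Either way the frame totals 32 bytes including the
       // return address.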
5136 
5137   // Calls to C2R adapters often do not accept exceptional returns.
5138   // We require their callers to bang for them.  But be careful, because some
5139   // VM calls (such as call site linkage) can use several kilobytes of stack;
5140   // the stack safety zone should account for that.
5141   // See bugs 4446381, 4468289, 4497237.
5142   if (stack_bang_size > 0) {
5143     generate_stack_overflow_check(stack_bang_size);
5144 
5145     // We always push rbp so that, on return to the interpreter, rbp will be
5146     // restored correctly and we can correct the stack.
5147     push(rbp);
5148     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5149     if (PreserveFramePointer) {
5150       mov(rbp, rsp);
5151     }
5152     // Remove word for ebp
5153     framesize -= wordSize;
5154 
5155     // Create frame
5156     if (framesize) {
5157       subptr(rsp, framesize);
5158     }
5159   } else {
5160     // Create frame (force generation of a 4 byte immediate value)
5161     subptr_imm32(rsp, framesize);
5162 
5163     // Save RBP register now.
5164     framesize -= wordSize;
5165     movptr(Address(rsp, framesize), rbp);
5166     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5167     if (PreserveFramePointer) {
5168       movptr(rbp, rsp);
5169       if (framesize > 0) {
5170         addptr(rbp, framesize);
5171       }
5172     }
5173   }
5174 
5175   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
5176     framesize -= wordSize;
5177     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5178   }
5179 
5180 #ifndef _LP64
5181   // If method sets FPU control word do it now
5182   if (fp_mode_24b) {
5183     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
5184   }
5185   if (UseSSE >= 2 && VerifyFPU) {
5186     verify_FPU(0, "FPU stack must be clean on entry");
5187   }
5188 #endif
5189 
5190 #ifdef ASSERT
5191   if (VerifyStackAtCalls) {
5192     Label L;
5193     push(rax);
5194     mov(rax, rsp);
5195     andptr(rax, StackAlignmentInBytes-1);
5196     cmpptr(rax, StackAlignmentInBytes-wordSize);
5197     pop(rax);
5198     jcc(Assembler::equal, L);
5199     STOP("Stack is not properly aligned!");
5200     bind(L);
5201   }
5202 #endif
5203 
5204   if (!is_stub) {
5205     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5206     bs->nmethod_entry_barrier(this);
5207   }
5208 }
5209 
5210 #if COMPILER2_OR_JVMCI
5211 
5212 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
5213 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5214   // cnt - number of qwords (8-byte words).
5215   // base - start address, qword aligned.
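       // Rough structure: zero a vector register, store 64 bytes per iteration
       // while at least 8 qwords remain, then clear the trailing 0..7 qwords
       // with a 32-byte store, a masked store, or individual 8-byte stores.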
5216   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5217   bool use64byteVector = MaxVectorSize == 64 && AVX3Threshold == 0;
5218   if (use64byteVector) {
5219     vpxor(xtmp, xtmp, xtmp, AVX_512bit);
5220   } else if (MaxVectorSize >= 32) {
5221     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5222   } else {
5223     pxor(xtmp, xtmp);
5224   }
5225   jmp(L_zero_64_bytes);
5226 
5227   BIND(L_loop);
5228   if (MaxVectorSize >= 32) {
5229     fill64_avx(base, 0, xtmp, use64byteVector);
5230   } else {
5231     movdqu(Address(base,  0), xtmp);
5232     movdqu(Address(base, 16), xtmp);
5233     movdqu(Address(base, 32), xtmp);
5234     movdqu(Address(base, 48), xtmp);
5235   }
5236   addptr(base, 64);
5237 
5238   BIND(L_zero_64_bytes);
5239   subptr(cnt, 8);
5240   jccb(Assembler::greaterEqual, L_loop);
5241 
5242   // Clear the remaining tail of fewer than 64 bytes
5243   if (use64byteVector) {
5244     addptr(cnt, 8);
5245     jccb(Assembler::equal, L_end);
5246     fill64_masked_avx(3, base, 0, xtmp, mask, cnt, rtmp, true);
5247     jmp(L_end);
5248   } else {
5249     addptr(cnt, 4);
5250     jccb(Assembler::less, L_tail);
5251     if (MaxVectorSize >= 32) {
5252       vmovdqu(Address(base, 0), xtmp);
5253     } else {
5254       movdqu(Address(base,  0), xtmp);
5255       movdqu(Address(base, 16), xtmp);
5256     }
5257   }
5258   addptr(base, 32);
5259   subptr(cnt, 4);
5260 
5261   BIND(L_tail);
5262   addptr(cnt, 4);
5263   jccb(Assembler::lessEqual, L_end);
5264   if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
5265     fill32_masked_avx(3, base, 0, xtmp, mask, cnt, rtmp);
5266   } else {
5267     decrement(cnt);
5268 
5269     BIND(L_sloop);
5270     movq(Address(base, 0), xtmp);
5271     addptr(base, 8);
5272     decrement(cnt);
5273     jccb(Assembler::greaterEqual, L_sloop);
5274   }
5275   BIND(L_end);
5276 }
5277 
5278 // Clearing constant sized memory using YMM/ZMM registers.
5279 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
5280   assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), "");
5281   bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
5282 
5283   int vector64_count = (cnt & (~0x7)) >> 3;
5284   cnt = cnt & 0x7;
5285   const int fill64_per_loop = 4;
5286   const int max_unrolled_fill64 = 8;
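       // Structure: vector64_count full 64-byte stores (the bulk via a
       // 4x-unrolled loop when more than max_unrolled_fill64 are needed, the
       // rest straight-line), followed by a switch that clears the remaining
       // 0..7 qwords with plain or masked stores.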
5287 
5288   // 64 byte initialization loop.
5289   vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
5290   int start64 = 0;
5291   if (vector64_count > max_unrolled_fill64) {
5292     Label LOOP;
5293     Register index = rtmp;
5294 
5295     start64 = vector64_count - (vector64_count % fill64_per_loop);
5296 
5297     movl(index, 0);
5298     BIND(LOOP);
5299     for (int i = 0; i < fill64_per_loop; i++) {
5300       fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
5301     }
5302     addl(index, fill64_per_loop * 64);
5303     cmpl(index, start64 * 64);
5304     jccb(Assembler::less, LOOP);
5305   }
5306   for (int i = start64; i < vector64_count; i++) {
5307     fill64_avx(base, i * 64, xtmp, use64byteVector);
5308   }
5309 
5310   // Clear remaining 64 byte tail.
5311   int disp = vector64_count * 64;
5312   if (cnt) {
5313     switch (cnt) {
5314       case 1:
5315         movq(Address(base, disp), xtmp);
5316         break;
5317       case 2:
5318         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_128bit);
5319         break;
5320       case 3:
5321         movl(rtmp, 0x7);
5322         kmovwl(mask, rtmp);
5323         evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_256bit);
5324         break;
5325       case 4:
5326         evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5327         break;
5328       case 5:
5329         if (use64byteVector) {
5330           movl(rtmp, 0x1F);
5331           kmovwl(mask, rtmp);
5332           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5333         } else {
5334           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5335           movq(Address(base, disp + 32), xtmp);
5336         }
5337         break;
5338       case 6:
5339         if (use64byteVector) {
5340           movl(rtmp, 0x3F);
5341           kmovwl(mask, rtmp);
5342           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5343         } else {
5344           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5345           evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, Assembler::AVX_128bit);
5346         }
5347         break;
5348       case 7:
5349         if (use64byteVector) {
5350           movl(rtmp, 0x7F);
5351           kmovwl(mask, rtmp);
5352           evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit);
5353         } else {
5354           evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit);
5355           movl(rtmp, 0x7);
5356           kmovwl(mask, rtmp);
5357           evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, Assembler::AVX_256bit);
5358         }
5359         break;
5360       default:
5361         fatal("Unexpected length : %d\n",cnt);
5362         break;
5363     }
5364   }
5365 }
5366 
5367 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
5368                                bool is_large, KRegister mask) {
5369   // cnt      - number of qwords (8-byte words).
5370   // base     - start address, qword aligned.
5371   // is_large - if optimizers know cnt is larger than InitArrayShortSize
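       // Dispatch: counts up to InitArrayShortSize/BytesPerLong are cleared
       // with a simple pointer-sized store loop; larger (or known-large) counts
       // use rep stosb, xmm_clear_mem, or rep stos depending on UseFastStosb
       // and UseXMMForObjInit.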
5372   assert(base==rdi, "base register must be edi for rep stos");
5373   assert(tmp==rax,   "tmp register must be eax for rep stos");
5374   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
5375   assert(InitArrayShortSize % BytesPerLong == 0,
5376     "InitArrayShortSize should be the multiple of BytesPerLong");
5377 
5378   Label DONE;
5379   if (!is_large || !UseXMMForObjInit) {
5380     xorptr(tmp, tmp);
5381   }
5382 
5383   if (!is_large) {
5384     Label LOOP, LONG;
5385     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5386     jccb(Assembler::greater, LONG);
5387 
5388     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5389 
5390     decrement(cnt);
5391     jccb(Assembler::negative, DONE); // Zero length
5392 
5393     // Use individual pointer-sized stores for small counts:
5394     BIND(LOOP);
5395     movptr(Address(base, cnt, Address::times_ptr), tmp);
5396     decrement(cnt);
5397     jccb(Assembler::greaterEqual, LOOP);
5398     jmpb(DONE);
5399 
5400     BIND(LONG);
5401   }
5402 
5403   // Use longer rep-prefixed ops for non-small counts:
5404   if (UseFastStosb) {
5405     shlptr(cnt, 3); // convert to number of bytes
5406     rep_stosb();
5407   } else if (UseXMMForObjInit) {
5408     xmm_clear_mem(base, cnt, tmp, xtmp, mask);
5409   } else {
5410     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5411     rep_stos();
5412   }
5413 
5414   BIND(DONE);
5415 }
5416 
5417 #endif //COMPILER2_OR_JVMCI
5418 
5419 
5420 void MacroAssembler::generate_fill(BasicType t, bool aligned,
5421                                    Register to, Register value, Register count,
5422                                    Register rtmp, XMMRegister xtmp) {
5423   ShortBranchVerifier sbv(this);
5424   assert_different_registers(to, value, count, rtmp);
5425   Label L_exit;
5426   Label L_fill_2_bytes, L_fill_4_bytes;
5427 
5428   int shift = -1;
5429   switch (t) {
5430     case T_BYTE:
5431       shift = 2;
5432       break;
5433     case T_SHORT:
5434       shift = 1;
5435       break;
5436     case T_INT:
5437       shift = 0;
5438       break;
5439     default: ShouldNotReachHere();
5440   }
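       // shift is log2 of the number of elements per 32-bit word, so an
       // expression like '8 << shift' below always denotes 32 bytes worth of
       // elements.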
5441 
5442   if (t == T_BYTE) {
5443     andl(value, 0xff);
5444     movl(rtmp, value);
5445     shll(rtmp, 8);
5446     orl(value, rtmp);
5447   }
5448   if (t == T_SHORT) {
5449     andl(value, 0xffff);
5450   }
5451   if (t == T_BYTE || t == T_SHORT) {
5452     movl(rtmp, value);
5453     shll(rtmp, 16);
5454     orl(value, rtmp);
5455   }
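       // e.g. for T_BYTE a value of 0x41 becomes 0x41414141 and for T_SHORT
       // 0x0041 becomes 0x00410041, so the 32-bit (and broadcast vector) stores
       // below fill every element with the same value.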
5456 
5457   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
5458   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
5459   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
5460     Label L_skip_align2;
5461     // align destination address to a 4-byte boundary
5462     if (t == T_BYTE) {
5463       Label L_skip_align1;
5464       // One byte misalignment happens only for byte arrays
5465       testptr(to, 1);
5466       jccb(Assembler::zero, L_skip_align1);
5467       movb(Address(to, 0), value);
5468       increment(to);
5469       decrement(count);
5470       BIND(L_skip_align1);
5471     }
5472     // Two bytes misalignment happens only for byte and short (char) arrays
5473     testptr(to, 2);
5474     jccb(Assembler::zero, L_skip_align2);
5475     movw(Address(to, 0), value);
5476     addptr(to, 2);
5477     subl(count, 1<<(shift-1));
5478     BIND(L_skip_align2);
5479   }
5480   if (UseSSE < 2) {
5481     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5482     // Fill 32-byte chunks
5483     subl(count, 8 << shift);
5484     jcc(Assembler::less, L_check_fill_8_bytes);
5485     align(16);
5486 
5487     BIND(L_fill_32_bytes_loop);
5488 
5489     for (int i = 0; i < 32; i += 4) {
5490       movl(Address(to, i), value);
5491     }
5492 
5493     addptr(to, 32);
5494     subl(count, 8 << shift);
5495     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5496     BIND(L_check_fill_8_bytes);
5497     addl(count, 8 << shift);
5498     jccb(Assembler::zero, L_exit);
5499     jmpb(L_fill_8_bytes);
5500 
5501     //
5502     // length is too short, just fill qwords
5503     //
5504     BIND(L_fill_8_bytes_loop);
5505     movl(Address(to, 0), value);
5506     movl(Address(to, 4), value);
5507     addptr(to, 8);
5508     BIND(L_fill_8_bytes);
5509     subl(count, 1 << (shift + 1));
5510     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5511     // fall through to fill 4 bytes
5512   } else {
5513     Label L_fill_32_bytes;
5514     if (!UseUnalignedLoadStores) {
5515       // align to 8 bytes, we know we are 4 byte aligned to start
5516       testptr(to, 4);
5517       jccb(Assembler::zero, L_fill_32_bytes);
5518       movl(Address(to, 0), value);
5519       addptr(to, 4);
5520       subl(count, 1<<shift);
5521     }
5522     BIND(L_fill_32_bytes);
5523     {
5524       assert( UseSSE >= 2, "supported cpu only" );
5525       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5526       movdl(xtmp, value);
5527       if (UseAVX >= 2 && UseUnalignedLoadStores) {
5528         Label L_check_fill_32_bytes;
5529         if (UseAVX > 2) {
5530           // Fill 64-byte chunks
5531           Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
5532 
5533           // If number of bytes to fill < AVX3Threshold, perform fill using AVX2
5534           cmpl(count, AVX3Threshold);
5535           jccb(Assembler::below, L_check_fill_64_bytes_avx2);
5536 
5537           vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
5538 
5539           subl(count, 16 << shift);
5540           jccb(Assembler::less, L_check_fill_32_bytes);
5541           align(16);
5542 
5543           BIND(L_fill_64_bytes_loop_avx3);
5544           evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
5545           addptr(to, 64);
5546           subl(count, 16 << shift);
5547           jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
5548           jmpb(L_check_fill_32_bytes);
5549 
5550           BIND(L_check_fill_64_bytes_avx2);
5551         }
5552         // Fill 64-byte chunks
5553         Label L_fill_64_bytes_loop;
5554         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
5555 
5556         subl(count, 16 << shift);
5557         jcc(Assembler::less, L_check_fill_32_bytes);
5558         align(16);
5559 
5560         BIND(L_fill_64_bytes_loop);
5561         vmovdqu(Address(to, 0), xtmp);
5562         vmovdqu(Address(to, 32), xtmp);
5563         addptr(to, 64);
5564         subl(count, 16 << shift);
5565         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
5566 
5567         BIND(L_check_fill_32_bytes);
5568         addl(count, 8 << shift);
5569         jccb(Assembler::less, L_check_fill_8_bytes);
5570         vmovdqu(Address(to, 0), xtmp);
5571         addptr(to, 32);
5572         subl(count, 8 << shift);
5573 
5574         BIND(L_check_fill_8_bytes);
5575         // clean upper bits of YMM registers
5576         movdl(xtmp, value);
5577         pshufd(xtmp, xtmp, 0);
5578       } else {
5579         // Fill 32-byte chunks
5580         pshufd(xtmp, xtmp, 0);
5581 
5582         subl(count, 8 << shift);
5583         jcc(Assembler::less, L_check_fill_8_bytes);
5584         align(16);
5585 
5586         BIND(L_fill_32_bytes_loop);
5587 
5588         if (UseUnalignedLoadStores) {
5589           movdqu(Address(to, 0), xtmp);
5590           movdqu(Address(to, 16), xtmp);
5591         } else {
5592           movq(Address(to, 0), xtmp);
5593           movq(Address(to, 8), xtmp);
5594           movq(Address(to, 16), xtmp);
5595           movq(Address(to, 24), xtmp);
5596         }
5597 
5598         addptr(to, 32);
5599         subl(count, 8 << shift);
5600         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5601 
5602         BIND(L_check_fill_8_bytes);
5603       }
5604       addl(count, 8 << shift);
5605       jccb(Assembler::zero, L_exit);
5606       jmpb(L_fill_8_bytes);
5607 
5608       //
5609       // length is too short, just fill qwords
5610       //
5611       BIND(L_fill_8_bytes_loop);
5612       movq(Address(to, 0), xtmp);
5613       addptr(to, 8);
5614       BIND(L_fill_8_bytes);
5615       subl(count, 1 << (shift + 1));
5616       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5617     }
5618   }
5619   // fill trailing 4 bytes
5620   BIND(L_fill_4_bytes);
5621   testl(count, 1<<shift);
5622   jccb(Assembler::zero, L_fill_2_bytes);
5623   movl(Address(to, 0), value);
5624   if (t == T_BYTE || t == T_SHORT) {
5625     Label L_fill_byte;
5626     addptr(to, 4);
5627     BIND(L_fill_2_bytes);
5628     // fill trailing 2 bytes
5629     testl(count, 1<<(shift-1));
5630     jccb(Assembler::zero, L_fill_byte);
5631     movw(Address(to, 0), value);
5632     if (t == T_BYTE) {
5633       addptr(to, 2);
5634       BIND(L_fill_byte);
5635       // fill trailing byte
5636       testl(count, 1);
5637       jccb(Assembler::zero, L_exit);
5638       movb(Address(to, 0), value);
5639     } else {
5640       BIND(L_fill_byte);
5641     }
5642   } else {
5643     BIND(L_fill_2_bytes);
5644   }
5645   BIND(L_exit);
5646 }
5647 
5648 // encode char[] to byte[] in ISO_8859_1 or ASCII
5649    //@IntrinsicCandidate
5650    //private static int implEncodeISOArray(byte[] sa, int sp,
5651    //byte[] da, int dp, int len) {
5652    //  int i = 0;
5653    //  for (; i < len; i++) {
5654    //    char c = StringUTF16.getChar(sa, sp++);
5655    //    if (c > '\u00FF')
5656    //      break;
5657    //    da[dp++] = (byte)c;
5658    //  }
5659    //  return i;
5660    //}
5661    //
5662    //@IntrinsicCandidate
5663    //private static int implEncodeAsciiArray(char[] sa, int sp,
5664    //    byte[] da, int dp, int len) {
5665    //  int i = 0;
5666    //  for (; i < len; i++) {
5667    //    char c = sa[sp++];
5668    //    if (c >= '\u0080')
5669    //      break;
5670    //    da[dp++] = (byte)c;
5671    //  }
5672    //  return i;
5673    //}
5674 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
5675   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
5676   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
5677   Register tmp5, Register result, bool ascii) {
5678 
5679   // rsi: src
5680   // rdi: dst
5681   // rdx: len
5682   // rcx: tmp5
5683   // rax: result
5684   ShortBranchVerifier sbv(this);
5685   assert_different_registers(src, dst, len, tmp5, result);
5686   Label L_done, L_copy_1_char, L_copy_1_char_exit;
5687 
5688   int mask = ascii ? 0xff80ff80 : 0xff00ff00;
5689   int short_mask = ascii ? 0xff80 : 0xff00;
5690 
5691   // set result
5692   xorl(result, result);
5693   // check for zero length
5694   testl(len, len);
5695   jcc(Assembler::zero, L_done);
5696 
5697   movl(result, len);
5698 
5699   // Setup pointers
5700   lea(src, Address(src, len, Address::times_2)); // char[]
5701   lea(dst, Address(dst, len, Address::times_1)); // byte[]
5702   negptr(len);
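       // len is negated so the loops below can address src/dst relative to
       // their ends and walk forward by incrementing len toward zero; on early
       // exit, result is fixed up by adding the (negative) count of unprocessed
       // elements.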
5703 
5704   if (UseSSE42Intrinsics || UseAVX >= 2) {
5705     Label L_copy_8_chars, L_copy_8_chars_exit;
5706     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
5707 
5708     if (UseAVX >= 2) {
5709       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
5710       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
5711       movdl(tmp1Reg, tmp5);
5712       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
5713       jmp(L_chars_32_check);
5714 
5715       bind(L_copy_32_chars);
5716       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
5717       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
5718       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5719       vptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
5720       jccb(Assembler::notZero, L_copy_32_chars_exit);
5721       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5722       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
5723       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
5724 
5725       bind(L_chars_32_check);
5726       addptr(len, 32);
5727       jcc(Assembler::lessEqual, L_copy_32_chars);
5728 
5729       bind(L_copy_32_chars_exit);
5730       subptr(len, 16);
5731       jccb(Assembler::greater, L_copy_16_chars_exit);
5732 
5733     } else if (UseSSE42Intrinsics) {
5734       movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
5735       movdl(tmp1Reg, tmp5);
5736       pshufd(tmp1Reg, tmp1Reg, 0);
5737       jmpb(L_chars_16_check);
5738     }
5739 
5740     bind(L_copy_16_chars);
5741     if (UseAVX >= 2) {
5742       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
5743       vptest(tmp2Reg, tmp1Reg);
5744       jcc(Assembler::notZero, L_copy_16_chars_exit);
5745       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
5746       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
5747     } else {
5748       if (UseAVX > 0) {
5749         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5750         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5751         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
5752       } else {
5753         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5754         por(tmp2Reg, tmp3Reg);
5755         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5756         por(tmp2Reg, tmp4Reg);
5757       }
5758       ptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
5759       jccb(Assembler::notZero, L_copy_16_chars_exit);
5760       packuswb(tmp3Reg, tmp4Reg);
5761     }
5762     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
5763 
5764     bind(L_chars_16_check);
5765     addptr(len, 16);
5766     jcc(Assembler::lessEqual, L_copy_16_chars);
5767 
5768     bind(L_copy_16_chars_exit);
5769     if (UseAVX >= 2) {
5770       // clean upper bits of YMM registers
5771       vpxor(tmp2Reg, tmp2Reg);
5772       vpxor(tmp3Reg, tmp3Reg);
5773       vpxor(tmp4Reg, tmp4Reg);
5774       movdl(tmp1Reg, tmp5);
5775       pshufd(tmp1Reg, tmp1Reg, 0);
5776     }
5777     subptr(len, 8);
5778     jccb(Assembler::greater, L_copy_8_chars_exit);
5779 
5780     bind(L_copy_8_chars);
5781     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
5782     ptest(tmp3Reg, tmp1Reg);
5783     jccb(Assembler::notZero, L_copy_8_chars_exit);
5784     packuswb(tmp3Reg, tmp1Reg);
5785     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
5786     addptr(len, 8);
5787     jccb(Assembler::lessEqual, L_copy_8_chars);
5788 
5789     bind(L_copy_8_chars_exit);
5790     subptr(len, 8);
5791     jccb(Assembler::zero, L_done);
5792   }
5793 
5794   bind(L_copy_1_char);
5795   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
5796   testl(tmp5, short_mask);      // check if Unicode or non-ASCII char
5797   jccb(Assembler::notZero, L_copy_1_char_exit);
5798   movb(Address(dst, len, Address::times_1, 0), tmp5);
5799   addptr(len, 1);
5800   jccb(Assembler::less, L_copy_1_char);
5801 
5802   bind(L_copy_1_char_exit);
5803   addptr(result, len); // len is the negative count of unprocessed elements
5804 
5805   bind(L_done);
5806 }
5807 
5808 #ifdef _LP64
5809 /**
5810  * Helper for multiply_to_len().
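      * Adds src1 and src2 into the 128-bit value dest_hi:dest_lo, propagating
      * the carry out of dest_lo into dest_hi after each addition.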
5811  */
5812 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
5813   addq(dest_lo, src1);
5814   adcq(dest_hi, 0);
5815   addq(dest_lo, src2);
5816   adcq(dest_hi, 0);
5817 }
5818 
5819 /**
5820  * Multiply 64 bit by 64 bit first loop.
5821  */
5822 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5823                                            Register y, Register y_idx, Register z,
5824                                            Register carry, Register product,
5825                                            Register idx, Register kdx) {
5826   //
5827   //  jlong carry, x[], y[], z[];
5828   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5829   //    huge_128 product = y[idx] * x[xstart] + carry;
5830   //    z[kdx] = (jlong)product;
5831   //    carry  = (jlong)(product >>> 64);
5832   //  }
5833   //  z[xstart] = carry;
5834   //
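       // x, y and z are int[] magnitudes stored most-significant int first
       // (BigInteger layout), so each 64-bit load of two adjacent ints is
       // rotated by 32 bits and 64-bit results are stored back as two ints in
       // the reverse order.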
5835 
5836   Label L_first_loop, L_first_loop_exit;
5837   Label L_one_x, L_one_y, L_multiply;
5838 
5839   decrementl(xstart);
5840   jcc(Assembler::negative, L_one_x);
5841 
5842   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
5843   rorq(x_xstart, 32); // convert big-endian to little-endian
5844 
5845   bind(L_first_loop);
5846   decrementl(idx);
5847   jcc(Assembler::negative, L_first_loop_exit);
5848   decrementl(idx);
5849   jcc(Assembler::negative, L_one_y);
5850   movq(y_idx, Address(y, idx, Address::times_4,  0));
5851   rorq(y_idx, 32); // convert big-endian to little-endian
5852   bind(L_multiply);
5853   movq(product, x_xstart);
5854   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
5855   addq(product, carry);
5856   adcq(rdx, 0);
5857   subl(kdx, 2);
5858   movl(Address(z, kdx, Address::times_4,  4), product);
5859   shrq(product, 32);
5860   movl(Address(z, kdx, Address::times_4,  0), product);
5861   movq(carry, rdx);
5862   jmp(L_first_loop);
5863 
5864   bind(L_one_y);
5865   movl(y_idx, Address(y,  0));
5866   jmp(L_multiply);
5867 
5868   bind(L_one_x);
5869   movl(x_xstart, Address(x,  0));
5870   jmp(L_first_loop);
5871 
5872   bind(L_first_loop_exit);
5873 }
5874 
5875 /**
5876  * Multiply 64 bit by 64 bit and add 128 bit.
5877  */
5878 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
5879                                             Register yz_idx, Register idx,
5880                                             Register carry, Register product, int offset) {
5881   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
5882   //     z[kdx] = (jlong)product;
5883 
5884   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
5885   rorq(yz_idx, 32); // convert big-endian to little-endian
5886   movq(product, x_xstart);
5887   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
5888   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
5889   rorq(yz_idx, 32); // convert big-endian to little-endian
5890 
5891   add2_with_carry(rdx, product, carry, yz_idx);
5892 
5893   movl(Address(z, idx, Address::times_4,  offset+4), product);
5894   shrq(product, 32);
5895   movl(Address(z, idx, Address::times_4,  offset), product);
5896 
5897 }
5898 
5899 /**
5900  * Multiply 128 bit by 128 bit. Unrolled inner loop.
5901  */
5902 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
5903                                              Register yz_idx, Register idx, Register jdx,
5904                                              Register carry, Register product,
5905                                              Register carry2) {
5906   //   jlong carry, x[], y[], z[];
5907   //   int kdx = ystart+1;
5908   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5909   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
5910   //     z[kdx+idx+1] = (jlong)product;
5911   //     jlong carry2  = (jlong)(product >>> 64);
5912   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
5913   //     z[kdx+idx] = (jlong)product;
5914   //     carry  = (jlong)(product >>> 64);
5915   //   }
5916   //   idx += 2;
5917   //   if (idx > 0) {
5918   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
5919   //     z[kdx+idx] = (jlong)product;
5920   //     carry  = (jlong)(product >>> 64);
5921   //   }
5922   //
5923 
5924   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5925 
5926   movl(jdx, idx);
5927   andl(jdx, 0xFFFFFFFC);
5928   shrl(jdx, 2);
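       // jdx = idx / 4: each iteration of the loop below consumes four ints
       // (two 64-bit limbs) of y and z.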
5929 
5930   bind(L_third_loop);
5931   subl(jdx, 1);
5932   jcc(Assembler::negative, L_third_loop_exit);
5933   subl(idx, 4);
5934 
5935   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
5936   movq(carry2, rdx);
5937 
5938   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
5939   movq(carry, rdx);
5940   jmp(L_third_loop);
5941 
5942   bind (L_third_loop_exit);
5943 
5944   andl (idx, 0x3);
5945   jcc(Assembler::zero, L_post_third_loop_done);
5946 
5947   Label L_check_1;
5948   subl(idx, 2);
5949   jcc(Assembler::negative, L_check_1);
5950 
5951   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
5952   movq(carry, rdx);
5953 
5954   bind (L_check_1);
5955   addl (idx, 0x2);
5956   andl (idx, 0x1);
5957   subl(idx, 1);
5958   jcc(Assembler::negative, L_post_third_loop_done);
5959 
5960   movl(yz_idx, Address(y, idx, Address::times_4,  0));
5961   movq(product, x_xstart);
5962   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
5963   movl(yz_idx, Address(z, idx, Address::times_4,  0));
5964 
5965   add2_with_carry(rdx, product, yz_idx, carry);
5966 
5967   movl(Address(z, idx, Address::times_4,  0), product);
5968   shrq(product, 32);
5969 
5970   shlq(rdx, 32);
5971   orq(product, rdx);
5972   movq(carry, product);
5973 
5974   bind(L_post_third_loop_done);
5975 }
5976 
5977 /**
5978  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
5979  *
5980  */
5981 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
5982                                                   Register carry, Register carry2,
5983                                                   Register idx, Register jdx,
5984                                                   Register yz_idx1, Register yz_idx2,
5985                                                   Register tmp, Register tmp3, Register tmp4) {
5986   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
5987 
5988   //   jlong carry, x[], y[], z[];
5989   //   int kdx = ystart+1;
5990   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5991   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
5992   //     jlong carry2  = (jlong)(tmp3 >>> 64);
5993   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
5994   //     carry  = (jlong)(tmp4 >>> 64);
5995   //     z[kdx+idx+1] = (jlong)tmp3;
5996   //     z[kdx+idx] = (jlong)tmp4;
5997   //   }
5998   //   idx += 2;
5999   //   if (idx > 0) {
6000   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
6001   //     z[kdx+idx] = (jlong)yz_idx1;
6002   //     carry  = (jlong)(yz_idx1 >>> 64);
6003   //   }
6004   //
6005 
6006   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6007 
6008   movl(jdx, idx);
6009   andl(jdx, 0xFFFFFFFC);
6010   shrl(jdx, 2);
6011 
6012   bind(L_third_loop);
6013   subl(jdx, 1);
6014   jcc(Assembler::negative, L_third_loop_exit);
6015   subl(idx, 4);
6016 
6017   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
6018   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
6019   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
6020   rorxq(yz_idx2, yz_idx2, 32);
6021 
6022   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
6023   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
6024 
6025   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
6026   rorxq(yz_idx1, yz_idx1, 32);
6027   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6028   rorxq(yz_idx2, yz_idx2, 32);
6029 
6030   if (VM_Version::supports_adx()) {
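    // ADX note: adcx consumes and produces only CF while adox consumes and
    // produces only OF, so the two add-with-carry chains below can be
    // interleaved without one chain clobbering the other's carry bit.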
6031     adcxq(tmp3, carry);
6032     adoxq(tmp3, yz_idx1);
6033 
6034     adcxq(tmp4, tmp);
6035     adoxq(tmp4, yz_idx2);
6036 
6037     movl(carry, 0); // does not affect flags
6038     adcxq(carry2, carry);
6039     adoxq(carry2, carry);
6040   } else {
6041     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
6042     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
6043   }
6044   movq(carry, carry2);
6045 
6046   movl(Address(z, idx, Address::times_4, 12), tmp3);
6047   shrq(tmp3, 32);
6048   movl(Address(z, idx, Address::times_4,  8), tmp3);
6049 
6050   movl(Address(z, idx, Address::times_4,  4), tmp4);
6051   shrq(tmp4, 32);
6052   movl(Address(z, idx, Address::times_4,  0), tmp4);
6053 
6054   jmp(L_third_loop);
6055 
6056   bind (L_third_loop_exit);
6057 
6058   andl (idx, 0x3);
6059   jcc(Assembler::zero, L_post_third_loop_done);
6060 
6061   Label L_check_1;
6062   subl(idx, 2);
6063   jcc(Assembler::negative, L_check_1);
6064 
6065   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
6066   rorxq(yz_idx1, yz_idx1, 32);
6067   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
6068   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6069   rorxq(yz_idx2, yz_idx2, 32);
6070 
6071   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
6072 
6073   movl(Address(z, idx, Address::times_4,  4), tmp3);
6074   shrq(tmp3, 32);
6075   movl(Address(z, idx, Address::times_4,  0), tmp3);
6076   movq(carry, tmp4);
6077 
6078   bind (L_check_1);
6079   addl (idx, 0x2);
6080   andl (idx, 0x1);
6081   subl(idx, 1);
6082   jcc(Assembler::negative, L_post_third_loop_done);
6083   movl(tmp4, Address(y, idx, Address::times_4,  0));
6084   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
6085   movl(tmp4, Address(z, idx, Address::times_4,  0));
6086 
6087   add2_with_carry(carry2, tmp3, tmp4, carry);
6088 
6089   movl(Address(z, idx, Address::times_4,  0), tmp3);
6090   shrq(tmp3, 32);
6091 
6092   shlq(carry2, 32);
6093   orq(tmp3, carry2);
6094   movq(carry, tmp3);
6095 
6096   bind(L_post_third_loop_done);
6097 }
6098 
6099 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
6101  *
6102  * rdi: x
6103  * rax: xlen
6104  * rsi: y
6105  * rcx: ylen
6106  * r8:  z
6107  * r11: zlen
6108  * r12: tmp1
6109  * r13: tmp2
6110  * r14: tmp3
6111  * r15: tmp4
6112  * rbx: tmp5
6113  *
6114  */
6115 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
6116                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
6117   ShortBranchVerifier sbv(this);
6118   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
6119 
6120   push(tmp1);
6121   push(tmp2);
6122   push(tmp3);
6123   push(tmp4);
6124   push(tmp5);
6125 
6126   push(xlen);
6127   push(zlen);
6128 
6129   const Register idx = tmp1;
6130   const Register kdx = tmp2;
6131   const Register xstart = tmp3;
6132 
6133   const Register y_idx = tmp4;
6134   const Register carry = tmp5;
6135   const Register product  = xlen;
6136   const Register x_xstart = zlen;  // reuse register
6137 
6138   // First Loop.
6139   //
6140   //  final static long LONG_MASK = 0xffffffffL;
6141   //  int xstart = xlen - 1;
6142   //  int ystart = ylen - 1;
6143   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
6145   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
6146   //    z[kdx] = (int)product;
6147   //    carry = product >>> 32;
6148   //  }
6149   //  z[xstart] = (int)carry;
6150   //
6151 
6152   movl(idx, ylen);      // idx = ylen;
6153   movl(kdx, zlen);      // kdx = xlen+ylen;
6154   xorq(carry, carry);   // carry = 0;
6155 
6156   Label L_done;
6157 
6158   movl(xstart, xlen);
6159   decrementl(xstart);
6160   jcc(Assembler::negative, L_done);
6161 
6162   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
6163 
6164   Label L_second_loop;
6165   testl(kdx, kdx);
6166   jcc(Assembler::zero, L_second_loop);
6167 
6168   Label L_carry;
6169   subl(kdx, 1);
6170   jcc(Assembler::zero, L_carry);
6171 
6172   movl(Address(z, kdx, Address::times_4,  0), carry);
6173   shrq(carry, 32);
6174   subl(kdx, 1);
6175 
6176   bind(L_carry);
6177   movl(Address(z, kdx, Address::times_4,  0), carry);
6178 
6179   // Second and third (nested) loops.
6180   //
6181   // for (int i = xstart-1; i >= 0; i--) { // Second loop
6182   //   carry = 0;
6183   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
6184   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
6185   //                    (z[k] & LONG_MASK) + carry;
6186   //     z[k] = (int)product;
6187   //     carry = product >>> 32;
6188   //   }
6189   //   z[i] = (int)carry;
6190   // }
6191   //
6192   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
6193 
6194   const Register jdx = tmp1;
6195 
6196   bind(L_second_loop);
6197   xorl(carry, carry);    // carry = 0;
6198   movl(jdx, ylen);       // j = ystart+1
6199 
6200   subl(xstart, 1);       // i = xstart-1;
6201   jcc(Assembler::negative, L_done);
6202 
6203   push (z);
6204 
6205   Label L_last_x;
6206   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
6207   subl(xstart, 1);       // i = xstart-1;
6208   jcc(Assembler::negative, L_last_x);
6209 
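  // Note: with BMI2, mulx takes one multiplicand implicitly in rdx, so x[xstart]
  // is loaded into rdx here; otherwise it is kept in the x_xstart register.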
6210   if (UseBMI2Instructions) {
6211     movq(rdx,  Address(x, xstart, Address::times_4,  0));
6212     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
6213   } else {
6214     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
6215     rorq(x_xstart, 32);  // convert big-endian to little-endian
6216   }
6217 
6218   Label L_third_loop_prologue;
6219   bind(L_third_loop_prologue);
6220 
6221   push (x);
6222   push (xstart);
6223   push (ylen);
6224 
6225 
6226   if (UseBMI2Instructions) {
6227     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
6228   } else { // !UseBMI2Instructions
6229     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
6230   }
6231 
6232   pop(ylen);
6233   pop(xlen);
6234   pop(x);
6235   pop(z);
6236 
6237   movl(tmp3, xlen);
6238   addl(tmp3, 1);
6239   movl(Address(z, tmp3, Address::times_4,  0), carry);
6240   subl(tmp3, 1);
6241   jccb(Assembler::negative, L_done);
6242 
6243   shrq(carry, 32);
6244   movl(Address(z, tmp3, Address::times_4,  0), carry);
6245   jmp(L_second_loop);
6246 
6247   // Next infrequent code is moved outside loops.
6248   bind(L_last_x);
6249   if (UseBMI2Instructions) {
6250     movl(rdx, Address(x,  0));
6251   } else {
6252     movl(x_xstart, Address(x,  0));
6253   }
6254   jmp(L_third_loop_prologue);
6255 
6256   bind(L_done);
6257 
6258   pop(zlen);
6259   pop(xlen);
6260 
6261   pop(tmp5);
6262   pop(tmp4);
6263   pop(tmp3);
6264   pop(tmp2);
6265   pop(tmp1);
6266 }
6267 
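/**
 * Compare two memory regions element by element (used by the vectorizedMismatch
 * intrinsic). 'result' receives the element index of the first mismatch, or -1
 * when the regions are equal over the whole length.
 *
 * A rough scalar sketch of the value computed below (an illustration only, not
 * the exact Java-side contract; the element width is 1 << log2_array_indxscale bytes):
 *
 *   nbytes = length << log2_array_indxscale;
 *   for (i = 0; i < nbytes; i++) {
 *     if (obja[i] != objb[i]) return i >> log2_array_indxscale; // first differing element
 *   }
 *   return -1; // same till end
 */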
6268 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
6269   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
6270   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
6271   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
6272   Label VECTOR8_TAIL, VECTOR4_TAIL;
6273   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
6274   Label SAME_TILL_END, DONE;
6275   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
6276 
6277   //scale is in rcx in both Win64 and Unix
6278   ShortBranchVerifier sbv(this);
6279 
6280   shlq(length);
6281   xorq(result, result);
6282 
6283   if ((AVX3Threshold == 0) && (UseAVX > 2) &&
6284       VM_Version::supports_avx512vlbw()) {
6285     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
6286 
6287     cmpq(length, 64);
6288     jcc(Assembler::less, VECTOR32_TAIL);
6289 
6290     movq(tmp1, length);
6291     andq(tmp1, 0x3F);      // tail count
6292     andq(length, ~(0x3F)); //vector count
6293 
6294     bind(VECTOR64_LOOP);
6295     // AVX512 code to compare 64 byte vectors.
6296     evmovdqub(rymm0, Address(obja, result), false, Assembler::AVX_512bit);
6297     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
6298     kortestql(k7, k7);
6299     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
6300     addq(result, 64);
6301     subq(length, 64);
6302     jccb(Assembler::notZero, VECTOR64_LOOP);
6303 
6305     testq(tmp1, tmp1);
6306     jcc(Assembler::zero, SAME_TILL_END);
6307 
6308     //bind(VECTOR64_TAIL);
    // AVX512 code to compare up to 63 byte vectors.
6310     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
6311     shlxq(tmp2, tmp2, tmp1);
6312     notq(tmp2);
6313     kmovql(k3, tmp2);
6314 
6315     evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
6316     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
6317 
6318     ktestql(k7, k3);
6319     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
6320 
6321     bind(VECTOR64_NOT_EQUAL);
6322     kmovql(tmp1, k7);
6323     notq(tmp1);
6324     tzcntq(tmp1, tmp1);
6325     addq(result, tmp1);
6326     shrq(result);
6327     jmp(DONE);
6328     bind(VECTOR32_TAIL);
6329   }
6330 
6331   cmpq(length, 8);
6332   jcc(Assembler::equal, VECTOR8_LOOP);
6333   jcc(Assembler::less, VECTOR4_TAIL);
6334 
6335   if (UseAVX >= 2) {
6336     Label VECTOR16_TAIL, VECTOR32_LOOP;
6337 
6338     cmpq(length, 16);
6339     jcc(Assembler::equal, VECTOR16_LOOP);
6340     jcc(Assembler::less, VECTOR8_LOOP);
6341 
6342     cmpq(length, 32);
6343     jccb(Assembler::less, VECTOR16_TAIL);
6344 
6345     subq(length, 32);
6346     bind(VECTOR32_LOOP);
6347     vmovdqu(rymm0, Address(obja, result));
6348     vmovdqu(rymm1, Address(objb, result));
6349     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
6350     vptest(rymm2, rymm2);
6351     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
6352     addq(result, 32);
6353     subq(length, 32);
6354     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
6355     addq(length, 32);
6356     jcc(Assembler::equal, SAME_TILL_END);
6357     //falling through if less than 32 bytes left //close the branch here.
6358 
6359     bind(VECTOR16_TAIL);
6360     cmpq(length, 16);
6361     jccb(Assembler::less, VECTOR8_TAIL);
6362     bind(VECTOR16_LOOP);
6363     movdqu(rymm0, Address(obja, result));
6364     movdqu(rymm1, Address(objb, result));
6365     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
6366     ptest(rymm2, rymm2);
6367     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6368     addq(result, 16);
6369     subq(length, 16);
6370     jcc(Assembler::equal, SAME_TILL_END);
6371     //falling through if less than 16 bytes left
6372   } else {//regular intrinsics
6373 
6374     cmpq(length, 16);
6375     jccb(Assembler::less, VECTOR8_TAIL);
6376 
6377     subq(length, 16);
6378     bind(VECTOR16_LOOP);
6379     movdqu(rymm0, Address(obja, result));
6380     movdqu(rymm1, Address(objb, result));
6381     pxor(rymm0, rymm1);
6382     ptest(rymm0, rymm0);
6383     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6384     addq(result, 16);
6385     subq(length, 16);
6386     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
6387     addq(length, 16);
6388     jcc(Assembler::equal, SAME_TILL_END);
6389     //falling through if less than 16 bytes left
6390   }
6391 
6392   bind(VECTOR8_TAIL);
6393   cmpq(length, 8);
6394   jccb(Assembler::less, VECTOR4_TAIL);
6395   bind(VECTOR8_LOOP);
6396   movq(tmp1, Address(obja, result));
6397   movq(tmp2, Address(objb, result));
6398   xorq(tmp1, tmp2);
6399   testq(tmp1, tmp1);
6400   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
6401   addq(result, 8);
6402   subq(length, 8);
6403   jcc(Assembler::equal, SAME_TILL_END);
6404   //falling through if less than 8 bytes left
6405 
6406   bind(VECTOR4_TAIL);
6407   cmpq(length, 4);
6408   jccb(Assembler::less, BYTES_TAIL);
6409   bind(VECTOR4_LOOP);
6410   movl(tmp1, Address(obja, result));
6411   xorl(tmp1, Address(objb, result));
6412   testl(tmp1, tmp1);
6413   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
6414   addq(result, 4);
6415   subq(length, 4);
6416   jcc(Assembler::equal, SAME_TILL_END);
6417   //falling through if less than 4 bytes left
6418 
6419   bind(BYTES_TAIL);
6420   bind(BYTES_LOOP);
6421   load_unsigned_byte(tmp1, Address(obja, result));
6422   load_unsigned_byte(tmp2, Address(objb, result));
6423   xorl(tmp1, tmp2);
6424   testl(tmp1, tmp1);
6425   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6426   decq(length);
6427   jcc(Assembler::zero, SAME_TILL_END);
6428   incq(result);
6429   load_unsigned_byte(tmp1, Address(obja, result));
6430   load_unsigned_byte(tmp2, Address(objb, result));
6431   xorl(tmp1, tmp2);
6432   testl(tmp1, tmp1);
6433   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6434   decq(length);
6435   jcc(Assembler::zero, SAME_TILL_END);
6436   incq(result);
6437   load_unsigned_byte(tmp1, Address(obja, result));
6438   load_unsigned_byte(tmp2, Address(objb, result));
6439   xorl(tmp1, tmp2);
6440   testl(tmp1, tmp1);
6441   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6442   jmp(SAME_TILL_END);
6443 
6444   if (UseAVX >= 2) {
6445     bind(VECTOR32_NOT_EQUAL);
6446     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
6447     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
6448     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
6449     vpmovmskb(tmp1, rymm0);
6450     bsfq(tmp1, tmp1);
6451     addq(result, tmp1);
6452     shrq(result);
6453     jmp(DONE);
6454   }
6455 
6456   bind(VECTOR16_NOT_EQUAL);
6457   if (UseAVX >= 2) {
6458     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
6459     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
6460     pxor(rymm0, rymm2);
6461   } else {
6462     pcmpeqb(rymm2, rymm2);
6463     pxor(rymm0, rymm1);
6464     pcmpeqb(rymm0, rymm1);
6465     pxor(rymm0, rymm2);
6466   }
6467   pmovmskb(tmp1, rymm0);
6468   bsfq(tmp1, tmp1);
6469   addq(result, tmp1);
6470   shrq(result);
6471   jmpb(DONE);
6472 
6473   bind(VECTOR8_NOT_EQUAL);
6474   bind(VECTOR4_NOT_EQUAL);
6475   bsfq(tmp1, tmp1);
6476   shrq(tmp1, 3);
6477   addq(result, tmp1);
6478   bind(BYTES_NOT_EQUAL);
6479   shrq(result);
6480   jmpb(DONE);
6481 
6482   bind(SAME_TILL_END);
6483   mov64(result, -1);
6484 
6485   bind(DONE);
6486 }
6487 
6488 //Helper functions for square_to_len()
6489 
6490 /**
6491  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
6492  * Preserves x and z and modifies rest of the registers.
6493  */
6494 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6495   // Perform square and right shift by 1
6496   // Handle odd xlen case first, then for even xlen do the following
6497   // jlong carry = 0;
6498   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
6499   //     huge_128 product = x[j:j+1] * x[j:j+1];
6500   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
6501   //     z[i+2:i+3] = (jlong)(product >>> 1);
6502   //     carry = (jlong)product;
6503   // }
6504 
6505   xorq(tmp5, tmp5);     // carry
6506   xorq(rdxReg, rdxReg);
6507   xorl(tmp1, tmp1);     // index for x
6508   xorl(tmp4, tmp4);     // index for z
6509 
6510   Label L_first_loop, L_first_loop_exit;
6511 
6512   testl(xlen, 1);
6513   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
6514 
6515   // Square and right shift by 1 the odd element using 32 bit multiply
6516   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
6517   imulq(raxReg, raxReg);
6518   shrq(raxReg, 1);
6519   adcq(tmp5, 0);
6520   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
6521   incrementl(tmp1);
6522   addl(tmp4, 2);
6523 
6524   // Square and  right shift by 1 the rest using 64 bit multiply
6525   bind(L_first_loop);
6526   cmpptr(tmp1, xlen);
6527   jccb(Assembler::equal, L_first_loop_exit);
6528 
6529   // Square
6530   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
6531   rorq(raxReg, 32);    // convert big-endian to little-endian
6532   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
6533 
6534   // Right shift by 1 and save carry
6535   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
6536   rcrq(rdxReg, 1);
6537   rcrq(raxReg, 1);
6538   adcq(tmp5, 0);
6539 
6540   // Store result in z
6541   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
6542   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
6543 
6544   // Update indices for x and z
6545   addl(tmp1, 2);
6546   addl(tmp4, 4);
6547   jmp(L_first_loop);
6548 
6549   bind(L_first_loop_exit);
6550 }
6551 
6552 
6553 /**
6554  * Perform the following multiply add operation using BMI2 instructions
6555  * carry:sum = sum + op1*op2 + carry
6556  * op2 should be in rdx
6557  * op2 is preserved, all other registers are modified
6558  */
6559 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
6560   // assert op2 is rdx
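  // 128-bit sketch of the operation performed by the sequence below (illustration only):
  //   unsigned __int128 t = (unsigned __int128)op1 * op2 + sum + carry;
  //   sum   = (uint64_t)t;          // low 64 bits
  //   carry = (uint64_t)(t >> 64);  // high 64 bits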
6561   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
6562   addq(sum, carry);
6563   adcq(tmp2, 0);
6564   addq(sum, op1);
6565   adcq(tmp2, 0);
6566   movq(carry, tmp2);
6567 }
6568 
6569 /**
6570  * Perform the following multiply add operation:
6571  * carry:sum = sum + op1*op2 + carry
6572  * Preserves op1, op2 and modifies rest of registers
6573  */
6574 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
6575   // rdx:rax = op1 * op2
6576   movq(raxReg, op2);
6577   mulq(op1);
6578 
6579   //  rdx:rax = sum + carry + rdx:rax
6580   addq(sum, carry);
6581   adcq(rdxReg, 0);
6582   addq(sum, raxReg);
6583   adcq(rdxReg, 0);
6584 
6585   // carry:sum = rdx:sum
6586   movq(carry, rdxReg);
6587 }
6588 
6589 /**
 * Add 64 bit long carry into z[] with carry propagation.
6591  * Preserves z and carry register values and modifies rest of registers.
6592  *
6593  */
6594 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
6595   Label L_fourth_loop, L_fourth_loop_exit;
6596 
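  // Rough sketch of the carry propagation performed below (illustration only):
  //   zlen -= 2;
  //   z[zlen:zlen+1] += carry;                 // 64-bit add, may carry out
  //   while (carry_out && (zlen -= 2) >= 0) {
  //     z[zlen:zlen+1] += 1;                   // ripple the carry upward
  //   }
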
6597   movl(tmp1, 1);
6598   subl(zlen, 2);
6599   addq(Address(z, zlen, Address::times_4, 0), carry);
6600 
6601   bind(L_fourth_loop);
6602   jccb(Assembler::carryClear, L_fourth_loop_exit);
6603   subl(zlen, 2);
6604   jccb(Assembler::negative, L_fourth_loop_exit);
6605   addq(Address(z, zlen, Address::times_4, 0), tmp1);
6606   jmp(L_fourth_loop);
6607   bind(L_fourth_loop_exit);
6608 }
6609 
6610 /**
6611  * Shift z[] left by 1 bit.
6612  * Preserves x, len, z and zlen registers and modifies rest of the registers.
6613  *
6614  */
6615 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
6616 
6617   Label L_fifth_loop, L_fifth_loop_exit;
6618 
6619   // Fifth loop
6620   // Perform primitiveLeftShift(z, zlen, 1)
6621 
6622   const Register prev_carry = tmp1;
6623   const Register new_carry = tmp4;
6624   const Register value = tmp2;
6625   const Register zidx = tmp3;
6626 
6627   // int zidx, carry;
6628   // long value;
6629   // carry = 0;
6630   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
6631   //    (carry:value)  = (z[i] << 1) | carry ;
6632   //    z[i] = value;
6633   // }
6634 
6635   movl(zidx, zlen);
6636   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
6637 
6638   bind(L_fifth_loop);
6639   decl(zidx);  // Use decl to preserve carry flag
6640   decl(zidx);
6641   jccb(Assembler::negative, L_fifth_loop_exit);
6642 
6643   if (UseBMI2Instructions) {
6644      movq(value, Address(z, zidx, Address::times_4, 0));
6645      rclq(value, 1);
6646      rorxq(value, value, 32);
6647      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6648   }
6649   else {
6650     // clear new_carry
6651     xorl(new_carry, new_carry);
6652 
6653     // Shift z[i] by 1, or in previous carry and save new carry
6654     movq(value, Address(z, zidx, Address::times_4, 0));
6655     shlq(value, 1);
6656     adcl(new_carry, 0);
6657 
6658     orq(value, prev_carry);
6659     rorq(value, 0x20);
6660     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6661 
6662     // Set previous carry = new carry
6663     movl(prev_carry, new_carry);
6664   }
6665   jmp(L_fifth_loop);
6666 
6667   bind(L_fifth_loop_exit);
6668 }
6669 
6670 
6671 /**
6672  * Code for BigInteger::squareToLen() intrinsic
6673  *
6674  * rdi: x
6675  * rsi: len
6676  * r8:  z
6677  * rcx: zlen
6678  * r12: tmp1
6679  * r13: tmp2
6680  * r14: tmp3
6681  * r15: tmp4
6682  * rbx: tmp5
6683  *
6684  */
6685 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6686 
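  // High-level sketch of the algorithm (informal; see BigInteger.squareToLen):
  //   x*x = (sum of the squares of the words) + 2 * (sum of the cross products)
  // The first loop stores the word squares right-shifted by one bit, the nested
  // loops add the cross products on top, and the final left shift by one bit
  // doubles the cross products while restoring the squares; the low bit dropped
  // by the initial right shift is recovered at the end from x[len-1] & 1.
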
6687   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
6688   push(tmp1);
6689   push(tmp2);
6690   push(tmp3);
6691   push(tmp4);
6692   push(tmp5);
6693 
6694   // First loop
6695   // Store the squares, right shifted one bit (i.e., divided by 2).
6696   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
6697 
6698   // Add in off-diagonal sums.
6699   //
6700   // Second, third (nested) and fourth loops.
6701   // zlen +=2;
6702   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
6703   //    carry = 0;
6704   //    long op2 = x[xidx:xidx+1];
6705   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
6706   //       k -= 2;
6707   //       long op1 = x[j:j+1];
6708   //       long sum = z[k:k+1];
6709   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
6710   //       z[k:k+1] = sum;
6711   //    }
6712   //    add_one_64(z, k, carry, tmp_regs);
6713   // }
6714 
6715   const Register carry = tmp5;
6716   const Register sum = tmp3;
6717   const Register op1 = tmp4;
6718   Register op2 = tmp2;
6719 
6720   push(zlen);
6721   push(len);
6722   addl(zlen,2);
6723   bind(L_second_loop);
6724   xorq(carry, carry);
6725   subl(zlen, 4);
6726   subl(len, 2);
6727   push(zlen);
6728   push(len);
6729   cmpl(len, 0);
6730   jccb(Assembler::lessEqual, L_second_loop_exit);
6731 
6732   // Multiply an array by one 64 bit long.
6733   if (UseBMI2Instructions) {
6734     op2 = rdxReg;
6735     movq(op2, Address(x, len, Address::times_4,  0));
6736     rorxq(op2, op2, 32);
6737   }
6738   else {
6739     movq(op2, Address(x, len, Address::times_4,  0));
6740     rorq(op2, 32);
6741   }
6742 
6743   bind(L_third_loop);
6744   decrementl(len);
6745   jccb(Assembler::negative, L_third_loop_exit);
6746   decrementl(len);
6747   jccb(Assembler::negative, L_last_x);
6748 
6749   movq(op1, Address(x, len, Address::times_4,  0));
6750   rorq(op1, 32);
6751 
6752   bind(L_multiply);
6753   subl(zlen, 2);
6754   movq(sum, Address(z, zlen, Address::times_4,  0));
6755 
  // Multiply 64 bit by 64 bit; add the lower 64 bits of the product into sum and keep the upper 64 bits as the new carry.
6757   if (UseBMI2Instructions) {
6758     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
6759   }
6760   else {
6761     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6762   }
6763 
6764   movq(Address(z, zlen, Address::times_4, 0), sum);
6765 
6766   jmp(L_third_loop);
6767   bind(L_third_loop_exit);
6768 
6769   // Fourth loop
  // Add 64 bit long carry into z with carry propagation.
6771   // Uses offsetted zlen.
6772   add_one_64(z, zlen, carry, tmp1);
6773 
6774   pop(len);
6775   pop(zlen);
6776   jmp(L_second_loop);
6777 
6778   // Next infrequent code is moved outside loops.
6779   bind(L_last_x);
6780   movl(op1, Address(x, 0));
6781   jmp(L_multiply);
6782 
6783   bind(L_second_loop_exit);
6784   pop(len);
6785   pop(zlen);
6786   pop(len);
6787   pop(zlen);
6788 
6789   // Fifth loop
6790   // Shift z left 1 bit.
6791   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
6792 
6793   // z[zlen-1] |= x[len-1] & 1;
6794   movl(tmp3, Address(x, len, Address::times_4, -4));
6795   andl(tmp3, 1);
6796   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
6797 
6798   pop(tmp5);
6799   pop(tmp4);
6800   pop(tmp3);
6801   pop(tmp2);
6802   pop(tmp1);
6803 }
6804 
6805 /**
6806  * Helper function for mul_add()
6807  * Multiply the in[] by int k and add to out[] starting at offset offs using
6808  * 128 bit by 32 bit multiply and return the carry in tmp5.
 * Only the quad-int-aligned (multiple of four ints) portion of in[] is processed by this function.
 * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
 * This function preserves the out, in and k registers.
 * len and offset point to the appropriate index into "in" and "out" respectively.
 * tmp5 holds the carry.
 * The other registers are temporaries and are modified.
6815  *
6816  */
6817 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
6818   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
6819   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6820 
6821   Label L_first_loop, L_first_loop_exit;
6822 
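  // Rough sketch of the 4-way unrolled loop below (illustration only; indices are
  // 32-bit words and the arrays are stored big-endian, as in the surrounding code):
  //   for (int count = len >> 2; count > 0; count--) {
  //     len -= 4; offset -= 4;
  //     carry:out[offset+2:offset+3] = in[len+2:len+3] * k + out[offset+2:offset+3] + carry;
  //     carry:out[offset  :offset+1] = in[len  :len+1] * k + out[offset  :offset+1] + carry;
  //   }
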
6823   movl(tmp1, len);
6824   shrl(tmp1, 2);
6825 
6826   bind(L_first_loop);
6827   subl(tmp1, 1);
6828   jccb(Assembler::negative, L_first_loop_exit);
6829 
6830   subl(len, 4);
6831   subl(offset, 4);
6832 
6833   Register op2 = tmp2;
6834   const Register sum = tmp3;
6835   const Register op1 = tmp4;
6836   const Register carry = tmp5;
6837 
6838   if (UseBMI2Instructions) {
6839     op2 = rdxReg;
6840   }
6841 
6842   movq(op1, Address(in, len, Address::times_4,  8));
6843   rorq(op1, 32);
6844   movq(sum, Address(out, offset, Address::times_4,  8));
6845   rorq(sum, 32);
6846   if (UseBMI2Instructions) {
6847     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6848   }
6849   else {
6850     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6851   }
6852   // Store back in big endian from little endian
6853   rorq(sum, 0x20);
6854   movq(Address(out, offset, Address::times_4,  8), sum);
6855 
6856   movq(op1, Address(in, len, Address::times_4,  0));
6857   rorq(op1, 32);
6858   movq(sum, Address(out, offset, Address::times_4,  0));
6859   rorq(sum, 32);
6860   if (UseBMI2Instructions) {
6861     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6862   }
6863   else {
6864     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6865   }
6866   // Store back in big endian from little endian
6867   rorq(sum, 0x20);
6868   movq(Address(out, offset, Address::times_4,  0), sum);
6869 
6870   jmp(L_first_loop);
6871   bind(L_first_loop_exit);
6872 }
6873 
6874 /**
6875  * Code for BigInteger::mulAdd() intrinsic
6876  *
6877  * rdi: out
6878  * rsi: in
6879  * r11: offs (out.length - offset)
6880  * rcx: len
6881  * r8:  k
6882  * r12: tmp1
6883  * r13: tmp2
6884  * r14: tmp3
6885  * r15: tmp4
6886  * rbx: tmp5
6887  * Multiply the in[] by word k and add to out[], return the carry in rax
6888  */
6889 void MacroAssembler::mul_add(Register out, Register in, Register offs,
6890    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
6891    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6892 
6893   Label L_carry, L_last_in, L_done;
6894 
6895 // carry = 0;
6896 // for (int j=len-1; j >= 0; j--) {
6897 //    long product = (in[j] & LONG_MASK) * kLong +
6898 //                   (out[offs] & LONG_MASK) + carry;
6899 //    out[offs--] = (int)product;
6900 //    carry = product >>> 32;
6901 // }
6902 //
6903   push(tmp1);
6904   push(tmp2);
6905   push(tmp3);
6906   push(tmp4);
6907   push(tmp5);
6908 
6909   Register op2 = tmp2;
6910   const Register sum = tmp3;
6911   const Register op1 = tmp4;
6912   const Register carry =  tmp5;
6913 
  if (UseBMI2Instructions) {
    op2 = rdxReg;
  }
  movl(op2, k);
6921 
6922   xorq(carry, carry);
6923 
6924   //First loop
6925 
6926   //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
6927   //The carry is in tmp5
6928   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
6929 
6930   //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
6931   decrementl(len);
6932   jccb(Assembler::negative, L_carry);
6933   decrementl(len);
6934   jccb(Assembler::negative, L_last_in);
6935 
6936   movq(op1, Address(in, len, Address::times_4,  0));
6937   rorq(op1, 32);
6938 
6939   subl(offs, 2);
6940   movq(sum, Address(out, offs, Address::times_4,  0));
6941   rorq(sum, 32);
6942 
6943   if (UseBMI2Instructions) {
6944     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6945   }
6946   else {
6947     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6948   }
6949 
6950   // Store back in big endian from little endian
6951   rorq(sum, 0x20);
6952   movq(Address(out, offs, Address::times_4,  0), sum);
6953 
6954   testl(len, len);
6955   jccb(Assembler::zero, L_carry);
6956 
6957   //Multiply the last in[] entry, if any
6958   bind(L_last_in);
6959   movl(op1, Address(in, 0));
6960   movl(sum, Address(out, offs, Address::times_4,  -4));
6961 
6962   movl(raxReg, k);
6963   mull(op1); //tmp4 * eax -> edx:eax
6964   addl(sum, carry);
6965   adcl(rdxReg, 0);
6966   addl(sum, raxReg);
6967   adcl(rdxReg, 0);
6968   movl(carry, rdxReg);
6969 
6970   movl(Address(out, offs, Address::times_4,  -4), sum);
6971 
6972   bind(L_carry);
6973   //return tmp5/carry as carry in rax
6974   movl(rax, carry);
6975 
6976   bind(L_done);
6977   pop(tmp5);
6978   pop(tmp4);
6979   pop(tmp3);
6980   pop(tmp2);
6981   pop(tmp1);
6982 }
6983 #endif
6984 
6985 /**
6986  * Emits code to update CRC-32 with a byte value according to constants in table
6987  *
6988  * @param [in,out]crc   Register containing the crc.
6989  * @param [in]val       Register containing the byte to fold into the CRC.
6990  * @param [in]table     Register containing the table of crc constants.
6991  *
6992  * uint32_t crc;
6993  * val = crc_table[(val ^ crc) & 0xFF];
6994  * crc = val ^ (crc >> 8);
6995  *
6996  */
6997 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
6998   xorl(val, crc);
6999   andl(val, 0xFF);
7000   shrl(crc, 8); // unsigned shift
7001   xorl(crc, Address(table, val, Address::times_4, 0));
7002 }
7003 
7004 /**
7005  * Fold 128-bit data chunk
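 *
 * Rough sketch of the folding identity used (informal): with the 128-bit
 * constant xK = [K_hi : K_lo] precomputed so that a carry-less multiply by
 * K_hi / K_lo is equivalent, modulo the CRC polynomial, to shifting by the
 * distance being folded over, the accumulator is updated as
 *
 *   xcrc' = clmul(xcrc[127:64], K_hi) ^ clmul(xcrc[63:0], K_lo) ^ data[127:0]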
7006  */
7007 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7008   if (UseAVX > 0) {
7009     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7010     vpclmulldq(xcrc, xK, xcrc); // [63:0]
7011     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7012     pxor(xcrc, xtmp);
7013   } else {
7014     movdqa(xtmp, xcrc);
7015     pclmulhdq(xtmp, xK);   // [123:64]
7016     pclmulldq(xcrc, xK);   // [63:0]
7017     pxor(xcrc, xtmp);
7018     movdqu(xtmp, Address(buf, offset));
7019     pxor(xcrc, xtmp);
7020   }
7021 }
7022 
7023 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7024   if (UseAVX > 0) {
7025     vpclmulhdq(xtmp, xK, xcrc);
7026     vpclmulldq(xcrc, xK, xcrc);
7027     pxor(xcrc, xbuf);
7028     pxor(xcrc, xtmp);
7029   } else {
7030     movdqa(xtmp, xcrc);
7031     pclmulhdq(xtmp, xK);
7032     pclmulldq(xcrc, xK);
7033     pxor(xcrc, xbuf);
7034     pxor(xcrc, xtmp);
7035   }
7036 }
7037 
7038 /**
7039  * 8-bit folds to compute 32-bit CRC
7040  *
7041  * uint64_t xcrc;
7042  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7043  */
7044 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7045   movdl(tmp, xcrc);
7046   andl(tmp, 0xFF);
7047   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7048   psrldq(xcrc, 1); // unsigned shift one byte
7049   pxor(xcrc, xtmp);
7050 }
7051 
7052 /**
7053  * uint32_t crc;
7054  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7055  */
7056 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7057   movl(tmp, crc);
7058   andl(tmp, 0xFF);
7059   shrl(crc, 8);
7060   xorl(crc, Address(table, tmp, Address::times_4, 0));
7061 }
7062 
7063 /**
7064  * @param crc   register containing existing CRC (32-bit)
7065  * @param buf   register pointing to input byte buffer (byte*)
7066  * @param len   register containing number of bytes
7067  * @param table register that will contain address of CRC table
7068  * @param tmp   scratch register
7069  */
7070 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7071   assert_different_registers(crc, buf, len, table, tmp, rax);
7072 
7073   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7074   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7075 
7076   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7077   // context for the registers used, where all instructions below are using 128-bit mode
7078   // On EVEX without VL and BW, these instructions will all be AVX.
7079   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7080   notl(crc); // ~crc
7081   cmpl(len, 16);
7082   jcc(Assembler::less, L_tail);
7083 
7084   // Align buffer to 16 bytes
7085   movl(tmp, buf);
7086   andl(tmp, 0xF);
7087   jccb(Assembler::zero, L_aligned);
7088   subl(tmp,  16);
7089   addl(len, tmp);
7090 
7091   align(4);
7092   BIND(L_align_loop);
7093   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7094   update_byte_crc32(crc, rax, table);
7095   increment(buf);
7096   incrementl(tmp);
7097   jccb(Assembler::less, L_align_loop);
7098 
7099   BIND(L_aligned);
7100   movl(tmp, len); // save
7101   shrl(len, 4);
7102   jcc(Assembler::zero, L_tail_restore);
7103 
7104   // Fold crc into first bytes of vector
7105   movdqa(xmm1, Address(buf, 0));
7106   movdl(rax, xmm1);
7107   xorl(crc, rax);
7108   if (VM_Version::supports_sse4_1()) {
7109     pinsrd(xmm1, crc, 0);
7110   } else {
7111     pinsrw(xmm1, crc, 0);
7112     shrl(crc, 16);
7113     pinsrw(xmm1, crc, 1);
7114   }
7115   addptr(buf, 16);
7116   subl(len, 4); // len > 0
7117   jcc(Assembler::less, L_fold_tail);
7118 
7119   movdqa(xmm2, Address(buf,  0));
7120   movdqa(xmm3, Address(buf, 16));
7121   movdqa(xmm4, Address(buf, 32));
7122   addptr(buf, 48);
7123   subl(len, 3);
7124   jcc(Assembler::lessEqual, L_fold_512b);
7125 
7126   // Fold total 512 bits of polynomial on each iteration,
7127   // 128 bits per each of 4 parallel streams.
7128   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
7129 
7130   align32();
7131   BIND(L_fold_512b_loop);
7132   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7133   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
7134   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
7135   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
7136   addptr(buf, 64);
7137   subl(len, 4);
7138   jcc(Assembler::greater, L_fold_512b_loop);
7139 
7140   // Fold 512 bits to 128 bits.
7141   BIND(L_fold_512b);
7142   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7143   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
7144   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7145   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7146 
7147   // Fold the rest of 128 bits data chunks
7148   BIND(L_fold_tail);
7149   addl(len, 3);
7150   jccb(Assembler::lessEqual, L_fold_128b);
7151   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7152 
7153   BIND(L_fold_tail_loop);
7154   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
7155   addptr(buf, 16);
7156   decrementl(len);
7157   jccb(Assembler::greater, L_fold_tail_loop);
7158 
7159   // Fold 128 bits in xmm1 down into 32 bits in crc register.
7160   BIND(L_fold_128b);
7161   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7162   if (UseAVX > 0) {
7163     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7164     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
7165     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7166   } else {
7167     movdqa(xmm2, xmm0);
7168     pclmulqdq(xmm2, xmm1, 0x1);
7169     movdqa(xmm3, xmm0);
7170     pand(xmm3, xmm2);
7171     pclmulqdq(xmm0, xmm3, 0x1);
7172   }
7173   psrldq(xmm1, 8);
7174   psrldq(xmm2, 4);
7175   pxor(xmm0, xmm1);
7176   pxor(xmm0, xmm2);
7177 
7178   // 8 8-bit folds to compute 32-bit CRC.
7179   for (int j = 0; j < 4; j++) {
7180     fold_8bit_crc32(xmm0, table, xmm1, rax);
7181   }
7182   movdl(crc, xmm0); // mov 32 bits to general register
7183   for (int j = 0; j < 4; j++) {
7184     fold_8bit_crc32(crc, table, rax);
7185   }
7186 
7187   BIND(L_tail_restore);
7188   movl(len, tmp); // restore
7189   BIND(L_tail);
7190   andl(len, 0xf);
7191   jccb(Assembler::zero, L_exit);
7192 
7193   // Fold the rest of bytes
7194   align(4);
7195   BIND(L_tail_loop);
7196   movsbl(rax, Address(buf, 0)); // load byte with sign extension
7197   update_byte_crc32(crc, rax, table);
7198   increment(buf);
7199   decrementl(len);
7200   jccb(Assembler::greater, L_tail_loop);
7201 
7202   BIND(L_exit);
  notl(crc); // ~crc
7204 }
7205 
7206 #ifdef _LP64
7207 // Helper function for AVX 512 CRC32
7208 // Fold 512-bit data chunks
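// Informally: each of the four 128-bit lanes of xcrc is folded with the same
// carry-less-multiply identity as fold_128bit_crc32 above; one qword of the
// lane is multiplied by one qword of xK, the other qword by the other, and
// both products are xor-ed with the corresponding 16 bytes of new input data.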
7209 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
7210                                              Register pos, int offset) {
7211   evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
7212   evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
7213   evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
7214   evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
7215   evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
7216 }
7217 
7218 // Helper function for AVX 512 CRC32
7219 // Compute CRC32 for < 256B buffers
7220 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
7221                                               Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7222                                               Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7223 
7224   Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
7225   Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
7226   Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
7227 
7228   // check if there is enough buffer to be able to fold 16B at a time
7229   cmpl(len, 32);
7230   jcc(Assembler::less, L_less_than_32);
7231 
7232   // if there is, load the constants
7233   movdqu(xmm10, Address(table, 1 * 16));    //rk1 and rk2 in xmm10
7234   movdl(xmm0, crc);                        // get the initial crc value
7235   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7236   pxor(xmm7, xmm0);
7237 
7238   // update the buffer pointer
7239   addl(pos, 16);
  // update the counter. Subtract 32 instead of 16 to save one instruction in the loop.
7241   subl(len, 32);
7242   jmp(L_16B_reduction_loop);
7243 
7244   bind(L_less_than_32);
  // move the initial crc to the return value. This is necessary for zero-length buffers.
7246   movl(rax, crc);
7247   testl(len, len);
7248   jcc(Assembler::equal, L_cleanup);
7249 
7250   movdl(xmm0, crc);                        //get the initial crc value
7251 
7252   cmpl(len, 16);
7253   jcc(Assembler::equal, L_exact_16_left);
7254   jcc(Assembler::less, L_less_than_16_left);
7255 
7256   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
7257   pxor(xmm7, xmm0);                       //xor the initial crc value
7258   addl(pos, 16);
7259   subl(len, 16);
7260   movdqu(xmm10, Address(table, 1 * 16));    // rk1 and rk2 in xmm10
7261   jmp(L_get_last_two_xmms);
7262 
7263   bind(L_less_than_16_left);
  // use stack space to load data shorter than 16 bytes; zero out the 16B in memory first.
7265   pxor(xmm1, xmm1);
7266   movptr(tmp1, rsp);
7267   movdqu(Address(tmp1, 0 * 16), xmm1);
7268 
7269   cmpl(len, 4);
7270   jcc(Assembler::less, L_only_less_than_4);
7271 
7272   //backup the counter value
7273   movl(tmp2, len);
7274   cmpl(len, 8);
7275   jcc(Assembler::less, L_less_than_8_left);
7276 
7277   //load 8 Bytes
7278   movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
7279   movq(Address(tmp1, 0 * 16), rax);
7280   addptr(tmp1, 8);
7281   subl(len, 8);
7282   addl(pos, 8);
7283 
7284   bind(L_less_than_8_left);
7285   cmpl(len, 4);
7286   jcc(Assembler::less, L_less_than_4_left);
7287 
7288   //load 4 Bytes
7289   movl(rax, Address(buf, pos, Address::times_1, 0));
7290   movl(Address(tmp1, 0 * 16), rax);
7291   addptr(tmp1, 4);
7292   subl(len, 4);
7293   addl(pos, 4);
7294 
7295   bind(L_less_than_4_left);
7296   cmpl(len, 2);
7297   jcc(Assembler::less, L_less_than_2_left);
7298 
7299   // load 2 Bytes
7300   movw(rax, Address(buf, pos, Address::times_1, 0));
7301   movl(Address(tmp1, 0 * 16), rax);
7302   addptr(tmp1, 2);
7303   subl(len, 2);
7304   addl(pos, 2);
7305 
7306   bind(L_less_than_2_left);
7307   cmpl(len, 1);
7308   jcc(Assembler::less, L_zero_left);
7309 
7310   // load 1 Byte
7311   movb(rax, Address(buf, pos, Address::times_1, 0));
7312   movb(Address(tmp1, 0 * 16), rax);
7313 
7314   bind(L_zero_left);
7315   movdqu(xmm7, Address(rsp, 0));
7316   pxor(xmm7, xmm0);                       //xor the initial crc value
7317 
7318   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7319   movdqu(xmm0, Address(rax, tmp2));
7320   pshufb(xmm7, xmm0);
7321   jmp(L_128_done);
7322 
7323   bind(L_exact_16_left);
7324   movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
7325   pxor(xmm7, xmm0);                       //xor the initial crc value
7326   jmp(L_128_done);
7327 
7328   bind(L_only_less_than_4);
7329   cmpl(len, 3);
7330   jcc(Assembler::less, L_only_less_than_3);
7331 
7332   // load 3 Bytes
7333   movb(rax, Address(buf, pos, Address::times_1, 0));
7334   movb(Address(tmp1, 0), rax);
7335 
7336   movb(rax, Address(buf, pos, Address::times_1, 1));
7337   movb(Address(tmp1, 1), rax);
7338 
7339   movb(rax, Address(buf, pos, Address::times_1, 2));
7340   movb(Address(tmp1, 2), rax);
7341 
7342   movdqu(xmm7, Address(rsp, 0));
7343   pxor(xmm7, xmm0);                     //xor the initial crc value
7344 
7345   pslldq(xmm7, 0x5);
7346   jmp(L_barrett);
7347   bind(L_only_less_than_3);
7348   cmpl(len, 2);
7349   jcc(Assembler::less, L_only_less_than_2);
7350 
7351   // load 2 Bytes
7352   movb(rax, Address(buf, pos, Address::times_1, 0));
7353   movb(Address(tmp1, 0), rax);
7354 
7355   movb(rax, Address(buf, pos, Address::times_1, 1));
7356   movb(Address(tmp1, 1), rax);
7357 
7358   movdqu(xmm7, Address(rsp, 0));
7359   pxor(xmm7, xmm0);                     //xor the initial crc value
7360 
7361   pslldq(xmm7, 0x6);
7362   jmp(L_barrett);
7363 
7364   bind(L_only_less_than_2);
7365   //load 1 Byte
7366   movb(rax, Address(buf, pos, Address::times_1, 0));
7367   movb(Address(tmp1, 0), rax);
7368 
7369   movdqu(xmm7, Address(rsp, 0));
7370   pxor(xmm7, xmm0);                     //xor the initial crc value
7371 
7372   pslldq(xmm7, 0x7);
7373 }
7374 
7375 /**
7376 * Compute CRC32 using AVX512 instructions
7377 * param crc   register containing existing CRC (32-bit)
7378 * param buf   register pointing to input byte buffer (byte*)
7379 * param len   register containing number of bytes
7380 * param table address of crc or crc32c table
7381 * param tmp1  scratch register
7382 * param tmp2  scratch register
7383 * return rax  result register
7384 *
7385 * This routine is identical for crc32c with the exception of the precomputed constant
7386 * table which will be passed as the table argument.  The calculation steps are
7387 * the same for both variants.
7388 */
7389 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
7390   assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
7391 
7392   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7393   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7394   Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
7395   Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
7396   Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
7397 
7398   const Register pos = r12;
7399   push(r12);
7400   subptr(rsp, 16 * 2 + 8);
7401 
7402   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7403   // context for the registers used, where all instructions below are using 128-bit mode
7404   // On EVEX without VL and BW, these instructions will all be AVX.
7405   movl(pos, 0);
7406 
7407   // check if smaller than 256B
7408   cmpl(len, 256);
7409   jcc(Assembler::less, L_less_than_256);
7410 
7411   // load the initial crc value
7412   movdl(xmm10, crc);
7413 
7414   // receive the initial 64B data, xor the initial crc value
7415   evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
7416   evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
7417   evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
7418   evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
7419 
7420   subl(len, 256);
7421   cmpl(len, 256);
7422   jcc(Assembler::less, L_fold_128_B_loop);
7423 
7424   evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
7425   evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
7426   evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
7427   subl(len, 256);
7428 
7429   bind(L_fold_256_B_loop);
7430   addl(pos, 256);
7431   fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
7432   fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
7433   fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
7434   fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
7435 
7436   subl(len, 256);
7437   jcc(Assembler::greaterEqual, L_fold_256_B_loop);
7438 
7439   // Fold 256 into 128
7440   addl(pos, 256);
7441   evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
7442   evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
7443   vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
7444 
7445   evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
7446   evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
7447   vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
7448 
7449   evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
7450   evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
7451 
7452   addl(len, 128);
7453   jmp(L_fold_128_B_register);
7454 
  // At this point in the code there are 128 * x + y (0 <= y < 128) bytes of buffer. The fold_128_B_loop
  // will fold 128B at a time until we have 128 + y bytes of buffer left.
7457 
  // fold 128B at a time. This section of the code folds two 512-bit accumulators (xmm0 and xmm4) in parallel.
7459   bind(L_fold_128_B_loop);
7460   addl(pos, 128);
7461   fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
7462   fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
7463 
7464   subl(len, 128);
7465   jcc(Assembler::greaterEqual, L_fold_128_B_loop);
7466 
7467   addl(pos, 128);
7468 
7469   // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
  // the 128B of folded data is in two 512-bit registers: xmm0 and xmm4
7471   bind(L_fold_128_B_register);
7472   evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
7473   evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7474   evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
7475   evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
7476   // save last that has no multiplicand
7477   vextracti64x2(xmm7, xmm4, 3);
7478 
7479   evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
7480   evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
7481   // Needed later in reduction loop
7482   movdqu(xmm10, Address(table, 1 * 16));
7483   vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
7484   vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
7485 
7486   // Swap 1,0,3,2 - 01 00 11 10
7487   evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
7488   evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
7489   vextracti128(xmm5, xmm8, 1);
7490   evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
7491 
7492   // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
7493   // instead of a cmp instruction, we use the negative flag with the jl instruction
7494   addl(len, 128 - 16);
7495   jcc(Assembler::less, L_final_reduction_for_128);
7496 
7497   bind(L_16B_reduction_loop);
7498   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7499   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7500   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7501   movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
7502   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7503   addl(pos, 16);
7504   subl(len, 16);
7505   jcc(Assembler::greaterEqual, L_16B_reduction_loop);
7506 
7507   bind(L_final_reduction_for_128);
7508   addl(len, 16);
7509   jcc(Assembler::equal, L_128_done);
7510 
7511   bind(L_get_last_two_xmms);
7512   movdqu(xmm2, xmm7);
7513   addl(pos, len);
7514   movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
7515   subl(pos, len);
7516 
7517   // get rid of the extra data that was loaded before
7518   // load the shift constant
7519   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7520   movdqu(xmm0, Address(rax, len));
7521   addl(rax, len);
7522 
7523   vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7524   //Change mask to 512
7525   vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
7526   vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
7527 
7528   blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
7529   vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
7530   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7531   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7532   vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
7533 
7534   bind(L_128_done);
7535   // compute crc of a 128-bit value
7536   movdqu(xmm10, Address(table, 3 * 16));
7537   movdqu(xmm0, xmm7);
7538 
7539   // 64b fold
7540   vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
7541   vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
7542   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7543 
7544   // 32b fold
7545   movdqu(xmm0, xmm7);
7546   vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
7547   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7548   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7549   jmp(L_barrett);
7550 
7551   bind(L_less_than_256);
7552   kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
7553 
7554   //barrett reduction
7555   bind(L_barrett);
7556   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
7557   movdqu(xmm1, xmm7);
7558   movdqu(xmm2, xmm7);
7559   movdqu(xmm10, Address(table, 4 * 16));
7560 
7561   pclmulqdq(xmm7, xmm10, 0x0);
7562   pxor(xmm7, xmm2);
7563   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
7564   movdqu(xmm2, xmm7);
7565   pclmulqdq(xmm7, xmm10, 0x10);
7566   pxor(xmm7, xmm2);
7567   pxor(xmm7, xmm1);
7568   pextrd(crc, xmm7, 2);
7569 
7570   bind(L_cleanup);
7571   addptr(rsp, 16 * 2 + 8);
7572   pop(r12);
7573 }
7574 
7575 // S. Gueron / Information Processing Letters 112 (2012) 184
7576 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
7577 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
7578 // Output: the 64-bit carry-less product of B * CONST
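// Rough table-lookup sketch of what the code below computes (illustration only;
// each table level n is assumed to hold 256 precomputed 64-bit entries
// T_n[b] = clmul(b, CONST) for every byte value b):
//
//   q1 = T_n[(B      ) & 0xFF];
//   q2 = T_n[(B >>  8) & 0xFF] <<  8;
//   q3 = T_n[(B >> 16) & 0xFF] << 16;
//   q4 = T_n[(B >> 24) & 0xFF] << 24;
//   return q1 ^ q2 ^ q3 ^ q4;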
7579 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
7580                                      Register tmp1, Register tmp2, Register tmp3) {
7581   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7582   if (n > 0) {
7583     addq(tmp3, n * 256 * 8);
7584   }
7585   //    Q1 = TABLEExt[n][B & 0xFF];
7586   movl(tmp1, in);
7587   andl(tmp1, 0x000000FF);
7588   shll(tmp1, 3);
7589   addq(tmp1, tmp3);
7590   movq(tmp1, Address(tmp1, 0));
7591 
7592   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7593   movl(tmp2, in);
7594   shrl(tmp2, 8);
7595   andl(tmp2, 0x000000FF);
7596   shll(tmp2, 3);
7597   addq(tmp2, tmp3);
7598   movq(tmp2, Address(tmp2, 0));
7599 
7600   shlq(tmp2, 8);
7601   xorq(tmp1, tmp2);
7602 
7603   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7604   movl(tmp2, in);
7605   shrl(tmp2, 16);
7606   andl(tmp2, 0x000000FF);
7607   shll(tmp2, 3);
7608   addq(tmp2, tmp3);
7609   movq(tmp2, Address(tmp2, 0));
7610 
7611   shlq(tmp2, 16);
7612   xorq(tmp1, tmp2);
7613 
7614   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7615   shrl(in, 24);
7616   andl(in, 0x000000FF);
7617   shll(in, 3);
7618   addq(in, tmp3);
7619   movq(in, Address(in, 0));
7620 
7621   shlq(in, 24);
7622   xorq(in, tmp1);
7623   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7624 }
7625 
7626 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7627                                       Register in_out,
7628                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7629                                       XMMRegister w_xtmp2,
7630                                       Register tmp1,
7631                                       Register n_tmp2, Register n_tmp3) {
7632   if (is_pclmulqdq_supported) {
7633     movdl(w_xtmp1, in_out); // modified blindly
7634 
7635     movl(tmp1, const_or_pre_comp_const_index);
7636     movdl(w_xtmp2, tmp1);
7637     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7638 
7639     movdq(in_out, w_xtmp1);
7640   } else {
7641     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
7642   }
7643 }
7644 
7645 // Recombination Alternative 2: No bit-reflections
7646 // T1 = (CRC_A * U1) << 1
7647 // T2 = (CRC_B * U2) << 1
7648 // C1 = T1 >> 32
7649 // C2 = T2 >> 32
7650 // T1 = T1 & 0xFFFFFFFF
7651 // T2 = T2 & 0xFFFFFFFF
7652 // T1 = CRC32(0, T1)
7653 // T2 = CRC32(0, T2)
7654 // C1 = C1 ^ T1
7655 // C2 = C2 ^ T2
7656 // CRC = C1 ^ C2 ^ CRC_C
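     //
     // An equivalent C sketch of the recombination (illustrative only; it assumes the SSE4.2
     // intrinsic _mm_crc32_u32 from <nmmintrin.h>, and that t1/t2 already hold T1/T2 from the
     // pseudocode above, i.e. the carry-less products already shifted left by one):
     //
     //   static inline uint32_t crc32c_recombine(uint64_t t1, uint64_t t2, uint32_t crc_c) {
     //     uint32_t c1 = (uint32_t)(t1 >> 32) ^ _mm_crc32_u32(0, (uint32_t)t1);
     //     uint32_t c2 = (uint32_t)(t2 >> 32) ^ _mm_crc32_u32(0, (uint32_t)t2);
     //     return c1 ^ c2 ^ crc_c;
     //   }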
7657 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7658                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7659                                      Register tmp1, Register tmp2,
7660                                      Register n_tmp3) {
7661   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7662   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7663   shlq(in_out, 1);
7664   movl(tmp1, in_out);
7665   shrq(in_out, 32);
7666   xorl(tmp2, tmp2);
7667   crc32(tmp2, tmp1, 4);
7668   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
7669   shlq(in1, 1);
7670   movl(tmp1, in1);
7671   shrq(in1, 32);
7672   xorl(tmp2, tmp2);
7673   crc32(tmp2, tmp1, 4);
7674   xorl(in1, tmp2);
7675   xorl(in_out, in1);
7676   xorl(in_out, in2);
7677 }
7678 
7679 // Set N to a predefined value
7680 // Subtract 3*N from the length of the buffer on every iteration
7681 // Execute in a loop:
7682 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
7683 // for i = 1 to N do
7684 //  CRC_A = CRC32(CRC_A, A[i])
7685 //  CRC_B = CRC32(CRC_B, B[i])
7686 //  CRC_C = CRC32(CRC_C, C[i])
7687 // end for
7688 // Recombine
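     //
     // A C-level sketch of one iteration of this scheme (illustrative only; load64() stands
     // for an unaligned 64-bit load, recombine() for crc32c_rec_alt2 above, and _mm_crc32_u64
     // is the SSE4.2 intrinsic from <nmmintrin.h>):
     //
     //   while (len >= 3 * size) {
     //     uint32_t crc_a = crc;            // the running CRC continues through partition A
     //     uint32_t crc_b = 0, crc_c = 0;
     //     for (uint32_t i = 0; i < size; i += 8) {
     //       crc_a = (uint32_t)_mm_crc32_u64(crc_a, load64(p + i));
     //       crc_b = (uint32_t)_mm_crc32_u64(crc_b, load64(p + size + i));
     //       crc_c = (uint32_t)_mm_crc32_u64(crc_c, load64(p + 2 * size + i));
     //     }
     //     crc = recombine(crc_a, crc_b, crc_c);
     //     p   += 3 * size;
     //     len -= 3 * size;
     //   }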
7689 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7690                                        Register in_out1, Register in_out2, Register in_out3,
7691                                        Register tmp1, Register tmp2, Register tmp3,
7692                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7693                                        Register tmp4, Register tmp5,
7694                                        Register n_tmp6) {
7695   Label L_processPartitions;
7696   Label L_processPartition;
7697   Label L_exit;
7698 
7699   bind(L_processPartitions);
7700   cmpl(in_out1, 3 * size);
7701   jcc(Assembler::less, L_exit);
7702     xorl(tmp1, tmp1);
7703     xorl(tmp2, tmp2);
7704     movq(tmp3, in_out2);
7705     addq(tmp3, size);
7706 
7707     bind(L_processPartition);
7708       crc32(in_out3, Address(in_out2, 0), 8);
7709       crc32(tmp1, Address(in_out2, size), 8);
7710       crc32(tmp2, Address(in_out2, size * 2), 8);
7711       addq(in_out2, 8);
7712       cmpq(in_out2, tmp3);
7713       jcc(Assembler::less, L_processPartition);
7714     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7715             w_xtmp1, w_xtmp2, w_xtmp3,
7716             tmp4, tmp5,
7717             n_tmp6);
7718     addq(in_out2, 2 * size);
7719     subl(in_out1, 3 * size);
7720     jmp(L_processPartitions);
7721 
7722   bind(L_exit);
7723 }
7724 #else
7725 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
7726                                      Register tmp1, Register tmp2, Register tmp3,
7727                                      XMMRegister xtmp1, XMMRegister xtmp2) {
7728   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7729   if (n > 0) {
7730     addl(tmp3, n * 256 * 8);
7731   }
7732   //    Q1 = TABLEExt[n][B & 0xFF];
7733   movl(tmp1, in_out);
7734   andl(tmp1, 0x000000FF);
7735   shll(tmp1, 3);
7736   addl(tmp1, tmp3);
7737   movq(xtmp1, Address(tmp1, 0));
7738 
7739   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7740   movl(tmp2, in_out);
7741   shrl(tmp2, 8);
7742   andl(tmp2, 0x000000FF);
7743   shll(tmp2, 3);
7744   addl(tmp2, tmp3);
7745   movq(xtmp2, Address(tmp2, 0));
7746 
7747   psllq(xtmp2, 8);
7748   pxor(xtmp1, xtmp2);
7749 
7750   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7751   movl(tmp2, in_out);
7752   shrl(tmp2, 16);
7753   andl(tmp2, 0x000000FF);
7754   shll(tmp2, 3);
7755   addl(tmp2, tmp3);
7756   movq(xtmp2, Address(tmp2, 0));
7757 
7758   psllq(xtmp2, 16);
7759   pxor(xtmp1, xtmp2);
7760 
7761   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7762   shrl(in_out, 24);
7763   andl(in_out, 0x000000FF);
7764   shll(in_out, 3);
7765   addl(in_out, tmp3);
7766   movq(xtmp2, Address(in_out, 0));
7767 
7768   psllq(xtmp2, 24);
7769   pxor(xtmp1, xtmp2); // Result in CXMM
7770   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7771 }
7772 
7773 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7774                                       Register in_out,
7775                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7776                                       XMMRegister w_xtmp2,
7777                                       Register tmp1,
7778                                       Register n_tmp2, Register n_tmp3) {
7779   if (is_pclmulqdq_supported) {
7780     movdl(w_xtmp1, in_out);
7781 
7782     movl(tmp1, const_or_pre_comp_const_index);
7783     movdl(w_xtmp2, tmp1);
7784     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7785     // Keep result in XMM since GPR is 32 bit in length
7786   } else {
7787     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
7788   }
7789 }
7790 
7791 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7792                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7793                                      Register tmp1, Register tmp2,
7794                                      Register n_tmp3) {
7795   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7796   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7797 
7798   psllq(w_xtmp1, 1);
7799   movdl(tmp1, w_xtmp1);
7800   psrlq(w_xtmp1, 32);
7801   movdl(in_out, w_xtmp1);
7802 
7803   xorl(tmp2, tmp2);
7804   crc32(tmp2, tmp1, 4);
7805   xorl(in_out, tmp2);
7806 
7807   psllq(w_xtmp2, 1);
7808   movdl(tmp1, w_xtmp2);
7809   psrlq(w_xtmp2, 32);
7810   movdl(in1, w_xtmp2);
7811 
7812   xorl(tmp2, tmp2);
7813   crc32(tmp2, tmp1, 4);
7814   xorl(in1, tmp2);
7815   xorl(in_out, in1);
7816   xorl(in_out, in2);
7817 }
7818 
7819 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7820                                        Register in_out1, Register in_out2, Register in_out3,
7821                                        Register tmp1, Register tmp2, Register tmp3,
7822                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7823                                        Register tmp4, Register tmp5,
7824                                        Register n_tmp6) {
7825   Label L_processPartitions;
7826   Label L_processPartition;
7827   Label L_exit;
7828 
7829   bind(L_processPartitions);
7830   cmpl(in_out1, 3 * size);
7831   jcc(Assembler::less, L_exit);
7832     xorl(tmp1, tmp1);
7833     xorl(tmp2, tmp2);
7834     movl(tmp3, in_out2);
7835     addl(tmp3, size);
7836 
7837     bind(L_processPartition);
7838       crc32(in_out3, Address(in_out2, 0), 4);
7839       crc32(tmp1, Address(in_out2, size), 4);
7840       crc32(tmp2, Address(in_out2, size*2), 4);
7841       crc32(in_out3, Address(in_out2, 0+4), 4);
7842       crc32(tmp1, Address(in_out2, size+4), 4);
7843       crc32(tmp2, Address(in_out2, size*2+4), 4);
7844       addl(in_out2, 8);
7845       cmpl(in_out2, tmp3);
7846       jcc(Assembler::less, L_processPartition);
7847 
7848         push(tmp3);
7849         push(in_out1);
7850         push(in_out2);
7851         tmp4 = tmp3;
7852         tmp5 = in_out1;
7853         n_tmp6 = in_out2;
7854 
7855       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7856             w_xtmp1, w_xtmp2, w_xtmp3,
7857             tmp4, tmp5,
7858             n_tmp6);
7859 
7860         pop(in_out2);
7861         pop(in_out1);
7862         pop(tmp3);
7863 
7864     addl(in_out2, 2 * size);
7865     subl(in_out1, 3 * size);
7866     jmp(L_processPartitions);
7867 
7868   bind(L_exit);
7869 }
7870 #endif //LP64
7871 
7872 #ifdef _LP64
7873 // Algorithm 2: Pipelined usage of the CRC32 instruction.
7874 // Input: A buffer I of L bytes.
7875 // Output: the CRC32C value of the buffer.
7876 // Notations:
7877 // Write L = 24N + r, with N = floor (L/24).
7878 // r = L mod 24 (0 <= r < 24).
7879 // Consider I as the concatenation of A|B|C|R, where A, B and C each consist of
7880 // N quadwords, and R consists of r bytes.
7881 // A[j] = I [8j+7 : 8j],             j = 0, 1, ..., N-1
7882 // B[j] = I [8N + 8j+7 : 8N + 8j],   j = 0, 1, ..., N-1
7883 // C[j] = I [16N + 8j+7 : 16N + 8j], j = 0, 1, ..., N-1
7884 // if r > 0, R[j] = I [24N + j],     j = 0, 1, ..., r-1
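     //
     // For example, for a 1000-byte buffer: N = floor(1000/24) = 41 and r = 1000 - 24*41 = 16,
     // so A, B and C each cover 41 quadwords (328 bytes) and R covers the trailing 16 bytes,
     // which are handled by the word-by-word and byte-by-byte tails below.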
7885 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7886                                           Register tmp1, Register tmp2, Register tmp3,
7887                                           Register tmp4, Register tmp5, Register tmp6,
7888                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7889                                           bool is_pclmulqdq_supported) {
7890   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7891   Label L_wordByWord;
7892   Label L_byteByByteProlog;
7893   Label L_byteByByte;
7894   Label L_exit;
7895 
7896   if (is_pclmulqdq_supported) {
7897     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7898     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
7899 
7900     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7901     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7902 
7903     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
7904     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
7905     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
7906   } else {
7907     const_or_pre_comp_const_index[0] = 1;
7908     const_or_pre_comp_const_index[1] = 0;
7909 
7910     const_or_pre_comp_const_index[2] = 3;
7911     const_or_pre_comp_const_index[3] = 2;
7912 
7913     const_or_pre_comp_const_index[4] = 5;
7914     const_or_pre_comp_const_index[5] = 4;
7915   }
7916   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
7917                     in2, in1, in_out,
7918                     tmp1, tmp2, tmp3,
7919                     w_xtmp1, w_xtmp2, w_xtmp3,
7920                     tmp4, tmp5,
7921                     tmp6);
7922   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
7923                     in2, in1, in_out,
7924                     tmp1, tmp2, tmp3,
7925                     w_xtmp1, w_xtmp2, w_xtmp3,
7926                     tmp4, tmp5,
7927                     tmp6);
7928   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
7929                     in2, in1, in_out,
7930                     tmp1, tmp2, tmp3,
7931                     w_xtmp1, w_xtmp2, w_xtmp3,
7932                     tmp4, tmp5,
7933                     tmp6);
7934   movl(tmp1, in2);
7935   andl(tmp1, 0x00000007);
7936   negl(tmp1);
7937   addl(tmp1, in2);
7938   addq(tmp1, in1);
7939 
7940   cmpq(in1, tmp1);
7941   jccb(Assembler::greaterEqual, L_byteByByteProlog);
7942   align(16);
7943   BIND(L_wordByWord);
7944     crc32(in_out, Address(in1, 0), 8);
7945     addq(in1, 8);
7946     cmpq(in1, tmp1);
7947     jcc(Assembler::less, L_wordByWord);
7948 
7949   BIND(L_byteByByteProlog);
7950   andl(in2, 0x00000007);
7951   movl(tmp2, 1);
7952 
7953   cmpl(tmp2, in2);
7954   jccb(Assembler::greater, L_exit);
7955   BIND(L_byteByByte);
7956     crc32(in_out, Address(in1, 0), 1);
7957     incq(in1);
7958     incl(tmp2);
7959     cmpl(tmp2, in2);
7960     jcc(Assembler::lessEqual, L_byteByByte);
7961 
7962   BIND(L_exit);
7963 }
7964 #else
7965 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7966                                           Register tmp1, Register  tmp2, Register tmp3,
7967                                           Register tmp4, Register  tmp5, Register tmp6,
7968                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7969                                           bool is_pclmulqdq_supported) {
7970   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7971   Label L_wordByWord;
7972   Label L_byteByByteProlog;
7973   Label L_byteByByte;
7974   Label L_exit;
7975 
7976   if (is_pclmulqdq_supported) {
7977     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7978     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
7979 
7980     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7981     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7982 
7983     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
7984     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
7985   } else {
7986     const_or_pre_comp_const_index[0] = 1;
7987     const_or_pre_comp_const_index[1] = 0;
7988 
7989     const_or_pre_comp_const_index[2] = 3;
7990     const_or_pre_comp_const_index[3] = 2;
7991 
7992     const_or_pre_comp_const_index[4] = 5;
7993     const_or_pre_comp_const_index[5] = 4;
7994   }
7995   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
7996                     in2, in1, in_out,
7997                     tmp1, tmp2, tmp3,
7998                     w_xtmp1, w_xtmp2, w_xtmp3,
7999                     tmp4, tmp5,
8000                     tmp6);
8001   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8002                     in2, in1, in_out,
8003                     tmp1, tmp2, tmp3,
8004                     w_xtmp1, w_xtmp2, w_xtmp3,
8005                     tmp4, tmp5,
8006                     tmp6);
8007   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8008                     in2, in1, in_out,
8009                     tmp1, tmp2, tmp3,
8010                     w_xtmp1, w_xtmp2, w_xtmp3,
8011                     tmp4, tmp5,
8012                     tmp6);
8013   movl(tmp1, in2);
8014   andl(tmp1, 0x00000007);
8015   negl(tmp1);
8016   addl(tmp1, in2);
8017   addl(tmp1, in1);
8018 
8019   BIND(L_wordByWord);
8020   cmpl(in1, tmp1);
8021   jcc(Assembler::greaterEqual, L_byteByByteProlog);
8022     crc32(in_out, Address(in1,0), 4);
8023     addl(in1, 4);
8024     jmp(L_wordByWord);
8025 
8026   BIND(L_byteByByteProlog);
8027   andl(in2, 0x00000007);
8028   movl(tmp2, 1);
8029 
8030   BIND(L_byteByByte);
8031   cmpl(tmp2, in2);
8032   jccb(Assembler::greater, L_exit);
8033     movb(tmp1, Address(in1, 0));
8034     crc32(in_out, tmp1, 1);
8035     incl(in1);
8036     incl(tmp2);
8037     jmp(L_byteByByte);
8038 
8039   BIND(L_exit);
8040 }
8041 #endif // LP64
8042 #undef BIND
8043 #undef BLOCK_COMMENT
8044 
8045 // Compress char[] array to byte[].
8046 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
8047 //   @IntrinsicCandidate
8048 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8049 //     for (int i = 0; i < len; i++) {
8050 //       int c = src[srcOff++];
8051 //       if (c >>> 8 != 0) {
8052 //         return 0;
8053 //       }
8054 //       dst[dstOff++] = (byte)c;
8055 //     }
8056 //     return len;
8057 //   }
8058 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
8059   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8060   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8061   Register tmp5, Register result, KRegister mask1, KRegister mask2) {
8062   Label copy_chars_loop, return_length, return_zero, done;
8063 
8064   // rsi: src
8065   // rdi: dst
8066   // rdx: len
8067   // rcx: tmp5
8068   // rax: result
8069 
8070   // rsi holds start addr of source char[] to be compressed
8071   // rdi holds start addr of destination byte[]
8072   // rdx holds length
8073 
8074   assert(len != result, "");
8075 
8076   // save length for return
8077   push(len);
8078 
8079   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
8080     VM_Version::supports_avx512vlbw() &&
8081     VM_Version::supports_bmi2()) {
8082 
8083     Label copy_32_loop, copy_loop_tail, below_threshold;
8084 
8085     // alignment
8086     Label post_alignment;
8087 
8088     // if length of the string is less than 32, handle it in an old-fashioned way
8089     testl(len, -32);
8090     jcc(Assembler::zero, below_threshold);
8091 
8092     // First check whether a character is compressible (<= 0xFF).
8093     // Create mask to test for Unicode chars inside zmm vector
8094     movl(result, 0x00FF);
8095     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
8096 
8097     testl(len, -64);
8098     jcc(Assembler::zero, post_alignment);
8099 
8100     movl(tmp5, dst);
8101     andl(tmp5, (32 - 1));
8102     negl(tmp5);
8103     andl(tmp5, (32 - 1));
8104 
8105     // bail out when there is nothing to be done
8106     testl(tmp5, 0xFFFFFFFF);
8107     jcc(Assembler::zero, post_alignment);
8108 
8109     // ~(~0 << len), where len is the # of remaining elements to process
8110     movl(result, 0xFFFFFFFF);
8111     shlxl(result, result, tmp5);
8112     notl(result);
8113     kmovdl(mask2, result);
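         // (For example, tmp5 == 5 gives result = ~(0xFFFFFFFF << 5) = 0x1F, i.e. a k-mask
         // selecting the 5 leading chars needed to bring dst up to 32-byte alignment.)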
8114 
8115     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8116     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8117     ktestd(mask1, mask2);
8118     jcc(Assembler::carryClear, return_zero);
8119 
8120     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8121 
8122     addptr(src, tmp5);
8123     addptr(src, tmp5);
8124     addptr(dst, tmp5);
8125     subl(len, tmp5);
8126 
8127     bind(post_alignment);
8128     // end of alignment
8129 
8130     movl(tmp5, len);
8131     andl(tmp5, (32 - 1));    // tail count (in chars)
8132     andl(len, ~(32 - 1));    // vector count (in chars)
8133     jcc(Assembler::zero, copy_loop_tail);
8134 
8135     lea(src, Address(src, len, Address::times_2));
8136     lea(dst, Address(dst, len, Address::times_1));
8137     negptr(len);
8138 
8139     bind(copy_32_loop);
8140     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), /*merge*/ false, Assembler::AVX_512bit);
8141     evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
8142     kortestdl(mask1, mask1);
8143     jcc(Assembler::carryClear, return_zero);
8144 
8145     // All elements in the current chunk are valid candidates for
8146     // compression. Write the truncated byte elements to memory.
8147     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
8148     addptr(len, 32);
8149     jcc(Assembler::notZero, copy_32_loop);
8150 
8151     bind(copy_loop_tail);
8152     // bail out when there is nothing to be done
8153     testl(tmp5, 0xFFFFFFFF);
8154     jcc(Assembler::zero, return_length);
8155 
8156     movl(len, tmp5);
8157 
8158     // ~(~0 << len), where len is the # of remaining elements to process
8159     movl(result, 0xFFFFFFFF);
8160     shlxl(result, result, len);
8161     notl(result);
8162 
8163     kmovdl(mask2, result);
8164 
8165     evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
8166     evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
8167     ktestd(mask1, mask2);
8168     jcc(Assembler::carryClear, return_zero);
8169 
8170     evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
8171     jmp(return_length);
8172 
8173     bind(below_threshold);
8174   }
8175 
8176   if (UseSSE42Intrinsics) {
8177     Label copy_32_loop, copy_16, copy_tail;
8178 
8179     movl(result, len);
8180 
8181     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
8182 
8183     // vectored compression
8184     andl(len, 0xfffffff0);    // vector count (in chars)
8185     andl(result, 0x0000000f);    // tail count (in chars)
8186     testl(len, len);
8187     jcc(Assembler::zero, copy_16);
8188 
8189     // compress 16 chars per iter
8190     movdl(tmp1Reg, tmp5);
8191     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8192     pxor(tmp4Reg, tmp4Reg);
8193 
8194     lea(src, Address(src, len, Address::times_2));
8195     lea(dst, Address(dst, len, Address::times_1));
8196     negptr(len);
8197 
8198     bind(copy_32_loop);
8199     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
8200     por(tmp4Reg, tmp2Reg);
8201     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
8202     por(tmp4Reg, tmp3Reg);
8203     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
8204     jcc(Assembler::notZero, return_zero);
8205     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8206     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
8207     addptr(len, 16);
8208     jcc(Assembler::notZero, copy_32_loop);
8209 
8210     // compress next vector of 8 chars (if any)
8211     bind(copy_16);
8212     movl(len, result);
8213     andl(len, 0xfffffff8);    // vector count (in chars)
8214     andl(result, 0x00000007);    // tail count (in chars)
8215     testl(len, len);
8216     jccb(Assembler::zero, copy_tail);
8217 
8218     movdl(tmp1Reg, tmp5);
8219     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
8220     pxor(tmp3Reg, tmp3Reg);
8221 
8222     movdqu(tmp2Reg, Address(src, 0));
8223     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
8224     jccb(Assembler::notZero, return_zero);
8225     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
8226     movq(Address(dst, 0), tmp2Reg);
8227     addptr(src, 16);
8228     addptr(dst, 8);
8229 
8230     bind(copy_tail);
8231     movl(len, result);
8232   }
8233   // compress 1 char per iter
8234   testl(len, len);
8235   jccb(Assembler::zero, return_length);
8236   lea(src, Address(src, len, Address::times_2));
8237   lea(dst, Address(dst, len, Address::times_1));
8238   negptr(len);
8239 
8240   bind(copy_chars_loop);
8241   load_unsigned_short(result, Address(src, len, Address::times_2));
8242   testl(result, 0xff00);      // check if Unicode char
8243   jccb(Assembler::notZero, return_zero);
8244   movb(Address(dst, len, Address::times_1), result);  // LATIN1 char; compress to 1 byte
8245   increment(len);
8246   jcc(Assembler::notZero, copy_chars_loop);
8247 
8248   // if compression succeeded, return length
8249   bind(return_length);
8250   pop(result);
8251   jmpb(done);
8252 
8253   // if compression failed, return 0
8254   bind(return_zero);
8255   xorl(result, result);
8256   addptr(rsp, wordSize);
8257 
8258   bind(done);
8259 }
8260 
8261 // Inflate byte[] array to char[].
8262 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
8263 //   @IntrinsicCandidate
8264 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
8265 //     for (int i = 0; i < len; i++) {
8266 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
8267 //     }
8268 //   }
8269 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
8270   XMMRegister tmp1, Register tmp2, KRegister mask) {
8271   Label copy_chars_loop, done, below_threshold, avx3_threshold;
8272   // rsi: src
8273   // rdi: dst
8274   // rdx: len
8275   // rcx: tmp2
8276 
8277   // rsi holds start addr of source byte[] to be inflated
8278   // rdi holds start addr of destination char[]
8279   // rdx holds length
8280   assert_different_registers(src, dst, len, tmp2);
8281   movl(tmp2, len);
8282   if ((UseAVX > 2) && // AVX512
8283     VM_Version::supports_avx512vlbw() &&
8284     VM_Version::supports_bmi2()) {
8285 
8286     Label copy_32_loop, copy_tail;
8287     Register tmp3_aliased = len;
8288 
8289     // if length of the string is less than 16, handle it in an old-fashioned way
8290     testl(len, -16);
8291     jcc(Assembler::zero, below_threshold);
8292 
8293     testl(len, -1 * AVX3Threshold);
8294     jcc(Assembler::zero, avx3_threshold);
8295 
8296     // In order to use only one arithmetic operation per main-loop iteration we use
8297     // this pre-calculation
8298     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
8299     andl(len, -32);     // vector count
8300     jccb(Assembler::zero, copy_tail);
8301 
8302     lea(src, Address(src, len, Address::times_1));
8303     lea(dst, Address(dst, len, Address::times_2));
8304     negptr(len);
8305 
8306 
8307     // inflate 32 chars per iter
8308     bind(copy_32_loop);
8309     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
8310     evmovdquw(Address(dst, len, Address::times_2), tmp1, /*merge*/ false, Assembler::AVX_512bit);
8311     addptr(len, 32);
8312     jcc(Assembler::notZero, copy_32_loop);
8313 
8314     bind(copy_tail);
8315     // bail out when there is nothing to be done
8316     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
8317     jcc(Assembler::zero, done);
8318 
8319     // ~(~0 << length), where length is the # of remaining elements to process
8320     movl(tmp3_aliased, -1);
8321     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
8322     notl(tmp3_aliased);
8323     kmovdl(mask, tmp3_aliased);
8324     evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
8325     evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
8326 
8327     jmp(done);
8328     bind(avx3_threshold);
8329   }
8330   if (UseSSE42Intrinsics) {
8331     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
8332 
8333     if (UseAVX > 1) {
8334       andl(tmp2, (16 - 1));
8335       andl(len, -16);
8336       jccb(Assembler::zero, copy_new_tail);
8337     } else {
8338       andl(tmp2, 0x00000007);   // tail count (in chars)
8339       andl(len, 0xfffffff8);    // vector count (in chars)
8340       jccb(Assembler::zero, copy_tail);
8341     }
8342 
8343     // vectored inflation
8344     lea(src, Address(src, len, Address::times_1));
8345     lea(dst, Address(dst, len, Address::times_2));
8346     negptr(len);
8347 
8348     if (UseAVX > 1) {
8349       bind(copy_16_loop);
8350       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
8351       vmovdqu(Address(dst, len, Address::times_2), tmp1);
8352       addptr(len, 16);
8353       jcc(Assembler::notZero, copy_16_loop);
8354 
8355       bind(below_threshold);
8356       bind(copy_new_tail);
8357       movl(len, tmp2);
8358       andl(tmp2, 0x00000007);
8359       andl(len, 0xFFFFFFF8);
8360       jccb(Assembler::zero, copy_tail);
8361 
8362       pmovzxbw(tmp1, Address(src, 0));
8363       movdqu(Address(dst, 0), tmp1);
8364       addptr(src, 8);
8365       addptr(dst, 2 * 8);
8366 
8367       jmp(copy_tail, true);
8368     }
8369 
8370     // inflate 8 chars per iter
8371     bind(copy_8_loop);
8372     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
8373     movdqu(Address(dst, len, Address::times_2), tmp1);
8374     addptr(len, 8);
8375     jcc(Assembler::notZero, copy_8_loop);
8376 
8377     bind(copy_tail);
8378     movl(len, tmp2);
8379 
8380     cmpl(len, 4);
8381     jccb(Assembler::less, copy_bytes);
8382 
8383     movdl(tmp1, Address(src, 0));  // load 4 byte chars
8384     pmovzxbw(tmp1, tmp1);
8385     movq(Address(dst, 0), tmp1);
8386     subptr(len, 4);
8387     addptr(src, 4);
8388     addptr(dst, 8);
8389 
8390     bind(copy_bytes);
8391   } else {
8392     bind(below_threshold);
8393   }
8394 
8395   testl(len, len);
8396   jccb(Assembler::zero, done);
8397   lea(src, Address(src, len, Address::times_1));
8398   lea(dst, Address(dst, len, Address::times_2));
8399   negptr(len);
8400 
8401   // inflate 1 char per iter
8402   bind(copy_chars_loop);
8403   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
8404   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
8405   increment(len);
8406   jcc(Assembler::notZero, copy_chars_loop);
8407 
8408   bind(done);
8409 }
8410 
8411 
8412 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
8413   switch(type) {
8414     case T_BYTE:
8415     case T_BOOLEAN:
8416       evmovdqub(dst, kmask, src, false, vector_len);
8417       break;
8418     case T_CHAR:
8419     case T_SHORT:
8420       evmovdquw(dst, kmask, src, false, vector_len);
8421       break;
8422     case T_INT:
8423     case T_FLOAT:
8424       evmovdqul(dst, kmask, src, false, vector_len);
8425       break;
8426     case T_LONG:
8427     case T_DOUBLE:
8428       evmovdquq(dst, kmask, src, false, vector_len);
8429       break;
8430     default:
8431       fatal("Unexpected type argument %s", type2name(type));
8432       break;
8433   }
8434 }
8435 
8436 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
8437   switch(type) {
8438     case T_BYTE:
8439     case T_BOOLEAN:
8440       evmovdqub(dst, kmask, src, true, vector_len);
8441       break;
8442     case T_CHAR:
8443     case T_SHORT:
8444       evmovdquw(dst, kmask, src, true, vector_len);
8445       break;
8446     case T_INT:
8447     case T_FLOAT:
8448       evmovdqul(dst, kmask, src, true, vector_len);
8449       break;
8450     case T_LONG:
8451     case T_DOUBLE:
8452       evmovdquq(dst, kmask, src, true, vector_len);
8453       break;
8454     default:
8455       fatal("Unexpected type argument %s", type2name(type));
8456       break;
8457   }
8458 }
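     // A usage sketch (the register names here are illustrative, not fixed by this API):
     // a masked 512-bit load of ints into xmm0 under k-mask ktmp would be
     //   evmovdqu(T_INT, ktmp, xmm0, Address(src, 0), Assembler::AVX_512bit);
     // and the masked fill helpers below use the store form in the same way.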
8459 
8460 #if COMPILER2_OR_JVMCI
8461 
8462 
8463 // Fill (memory set) operation for a length of less than 64 bytes.
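     // For example (using the parameter names below): shift == 3 (long elements) and
     // length == 7 on the !use64byteVector path stores 4 longs with a full 32-byte vector,
     // leaves length == 3, and lets fill32_masked_avx store the remaining 3 longs under the
     // k-mask (1 << 3) - 1 = 0b111; with use64byteVector a single masked 64-byte store under
     // (1 << 7) - 1 is emitted instead.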
8464 void MacroAssembler::fill64_masked_avx(uint shift, Register dst, int disp,
8465                                        XMMRegister xmm, KRegister mask, Register length,
8466                                        Register temp, bool use64byteVector) {
8467   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8468   assert(shift != 0, "shift value should be 1 (short), 2 (int) or 3 (long)");
8469   BasicType type[] = { T_BYTE, T_SHORT,  T_INT,   T_LONG};
8470   if (!use64byteVector) {
8471     fill32_avx(dst, disp, xmm);
8472     subptr(length, 32 >> shift);
8473     fill32_masked_avx(shift, dst, disp + 32, xmm, mask, length, temp);
8474   } else {
8475     assert(MaxVectorSize == 64, "vector length != 64");
8476     movl(temp, 1);
8477     shlxl(temp, temp, length);
8478     subptr(temp, 1);
8479     kmovwl(mask, temp);
8480     evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_512bit);
8481   }
8482 }
8483 
8484 
8485 void MacroAssembler::fill32_masked_avx(uint shift, Register dst, int disp,
8486                                        XMMRegister xmm, KRegister mask, Register length,
8487                                        Register temp) {
8488   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8489   assert(shift != 0, "shift value should be 1 (short), 2 (int) or 3 (long)");
8490   BasicType type[] = { T_BYTE, T_SHORT,  T_INT,   T_LONG};
8491   movl(temp, 1);
8492   shlxl(temp, temp, length);
8493   subptr(temp, 1);
8494   kmovwl(mask, temp);
8495   evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_256bit);
8496 }
8497 
8498 void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
8499   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8500   vmovdqu(dst, xmm);
8501 }
8502 
8503 void MacroAssembler::fill32_avx(Register dst, int disp, XMMRegister xmm) {
8504   fill32(Address(dst, disp), xmm);
8505 }
8506 
8507 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
8508   assert(MaxVectorSize >= 32, "vector length should be >= 32");
8509   if (!use64byteVector) {
8510     fill32(dst, xmm);
8511     fill32(dst.plus_disp(32), xmm);
8512   } else {
8513     evmovdquq(dst, xmm, Assembler::AVX_512bit);
8514   }
8515 }
8516 
8517 void MacroAssembler::fill64_avx(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
8518   fill64(Address(dst, disp), xmm, use64byteVector);
8519 }
8520 
8521 #endif //COMPILER2_OR_JVMCI
8522 
8523 
8524 #ifdef _LP64
8525 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
8526   Label done;
8527   cvttss2sil(dst, src);
8528   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
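       // JLS requires NaN -> 0 and saturation to Integer.MAX_VALUE/MIN_VALUE on overflow,
       // while cvttss2si returns the "integer indefinite" value 0x80000000 in all of those
       // cases; that sentinel is what the comparison below detects before calling the stub.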
8529   cmpl(dst, 0x80000000); // float_sign_flip
8530   jccb(Assembler::notEqual, done);
8531   subptr(rsp, 8);
8532   movflt(Address(rsp, 0), src);
8533   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
8534   pop(dst);
8535   bind(done);
8536 }
8537 
8538 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
8539   Label done;
8540   cvttsd2sil(dst, src);
8541   // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
8542   cmpl(dst, 0x80000000); // float_sign_flip
8543   jccb(Assembler::notEqual, done);
8544   subptr(rsp, 8);
8545   movdbl(Address(rsp, 0), src);
8546   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
8547   pop(dst);
8548   bind(done);
8549 }
8550 
8551 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
8552   Label done;
8553   cvttss2siq(dst, src);
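       // Conversion does not match JLS here either: cvttss2siq returns 0x8000000000000000 for
       // NaN and out-of-range inputs, which the comparison against double_sign_flip() detects
       // so the fixup stub can produce the JLS-mandated result.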
8554   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
8555   jccb(Assembler::notEqual, done);
8556   subptr(rsp, 8);
8557   movflt(Address(rsp, 0), src);
8558   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
8559   pop(dst);
8560   bind(done);
8561 }
8562 
8563 void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
8564   Label done;
8565   cvttsd2siq(dst, src);
8566   cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
8567   jccb(Assembler::notEqual, done);
8568   subptr(rsp, 8);
8569   movdbl(Address(rsp, 0), src);
8570   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
8571   pop(dst);
8572   bind(done);
8573 }
8574 
8575 void MacroAssembler::cache_wb(Address line)
8576 {
8577   // 64-bit CPUs always support clflush
8578   assert(VM_Version::supports_clflush(), "clflush should be available");
8579   bool optimized = VM_Version::supports_clflushopt();
8580   bool no_evict = VM_Version::supports_clwb();
8581 
8582   // prefer clwb (writeback without evict);
8583   // otherwise prefer clflushopt (potentially parallel writeback with evict);
8584   // otherwise fall back on clflush (serial writeback with evict)
8585 
8586   if (optimized) {
8587     if (no_evict) {
8588       clwb(line);
8589     } else {
8590       clflushopt(line);
8591     }
8592   } else {
8593     // no need for fence when using CLFLUSH
8594     clflush(line);
8595   }
8596 }
8597 
8598 void MacroAssembler::cache_wbsync(bool is_pre)
8599 {
8600   assert(VM_Version::supports_clflush(), "clflush should be available");
8601   bool optimized = VM_Version::supports_clflushopt();
8602   bool no_evict = VM_Version::supports_clwb();
8603 
8604   // pick the correct implementation
8605 
8606   if (!is_pre && (optimized || no_evict)) {
8607     // need an sfence for post flush when using clflushopt or clwb;
8608     // otherwise no need for any synchronization
8609 
8610     sfence();
8611   }
8612 }
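     // A typical usage sketch (an assumption about callers, not a requirement of this API):
     // emit cache_wb(line) once per 64-byte cache line of the range being persisted, then
     // cache_wbsync(false) so the flushes are ordered before later stores; cache_wbsync(true)
     // emits nothing with the current implementations.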
8613 
8614 #endif // _LP64
8615 
8616 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
8617   switch (cond) {
8618     // Note some conditions are synonyms for others
8619     case Assembler::zero:         return Assembler::notZero;
8620     case Assembler::notZero:      return Assembler::zero;
8621     case Assembler::less:         return Assembler::greaterEqual;
8622     case Assembler::lessEqual:    return Assembler::greater;
8623     case Assembler::greater:      return Assembler::lessEqual;
8624     case Assembler::greaterEqual: return Assembler::less;
8625     case Assembler::below:        return Assembler::aboveEqual;
8626     case Assembler::belowEqual:   return Assembler::above;
8627     case Assembler::above:        return Assembler::belowEqual;
8628     case Assembler::aboveEqual:   return Assembler::below;
8629     case Assembler::overflow:     return Assembler::noOverflow;
8630     case Assembler::noOverflow:   return Assembler::overflow;
8631     case Assembler::negative:     return Assembler::positive;
8632     case Assembler::positive:     return Assembler::negative;
8633     case Assembler::parity:       return Assembler::noParity;
8634     case Assembler::noParity:     return Assembler::parity;
8635   }
8636   ShouldNotReachHere(); return Assembler::overflow;
8637 }
8638 
8639 SkipIfEqual::SkipIfEqual(
8640     MacroAssembler* masm, const bool* flag_addr, bool value) {
8641   _masm = masm;
8642   _masm->cmp8(ExternalAddress((address)flag_addr), value);
8643   _masm->jcc(Assembler::equal, _label);
8644 }
8645 
8646 SkipIfEqual::~SkipIfEqual() {
8647   _masm->bind(_label);
8648 }
8649 
8650 // 32-bit Windows has its own fast-path implementation
8651 // of get_thread
8652 #if !defined(WIN32) || defined(_LP64)
8653 
8654 // This is simply a call to Thread::current()
8655 void MacroAssembler::get_thread(Register thread) {
8656   if (thread != rax) {
8657     push(rax);
8658   }
8659   LP64_ONLY(push(rdi);)
8660   LP64_ONLY(push(rsi);)
8661   push(rdx);
8662   push(rcx);
8663 #ifdef _LP64
8664   push(r8);
8665   push(r9);
8666   push(r10);
8667   push(r11);
8668 #endif
8669 
8670   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
8671 
8672 #ifdef _LP64
8673   pop(r11);
8674   pop(r10);
8675   pop(r9);
8676   pop(r8);
8677 #endif
8678   pop(rcx);
8679   pop(rdx);
8680   LP64_ONLY(pop(rsi);)
8681   LP64_ONLY(pop(rdi);)
8682   if (thread != rax) {
8683     mov(thread, rax);
8684     pop(rax);
8685   }
8686 }
8687 
8688 #endif // !WIN32 || _LP64